A description of the data, where it came from, and what it contains.
train <- read.csv("Data/train.csv")
test <- read.csv("Data/test.csv")
# Merge the data frames and add a column indicating whether they come from the train or test set
train$train <- 1
test$SalePrice <- NA
test$train <- 0
ames <- rbind(train, test)
# Verify data frame
head(ames)
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour
## 1 1 60 RL 65 8450 Pave <NA> Reg Lvl
## 2 2 20 RL 80 9600 Pave <NA> Reg Lvl
## 3 3 60 RL 68 11250 Pave <NA> IR1 Lvl
## 4 4 70 RL 60 9550 Pave <NA> IR1 Lvl
## 5 5 60 RL 84 14260 Pave <NA> IR1 Lvl
## 6 6 50 RL 85 14115 Pave <NA> IR1 Lvl
## Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType
## 1 AllPub Inside Gtl CollgCr Norm Norm 1Fam
## 2 AllPub FR2 Gtl Veenker Feedr Norm 1Fam
## 3 AllPub Inside Gtl CollgCr Norm Norm 1Fam
## 4 AllPub Corner Gtl Crawfor Norm Norm 1Fam
## 5 AllPub FR2 Gtl NoRidge Norm Norm 1Fam
## 6 AllPub Inside Gtl Mitchel Norm Norm 1Fam
## HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl
## 1 2Story 7 5 2003 2003 Gable CompShg
## 2 1Story 6 8 1976 1976 Gable CompShg
## 3 2Story 7 5 2001 2002 Gable CompShg
## 4 2Story 7 5 1915 1970 Gable CompShg
## 5 2Story 8 5 2000 2000 Gable CompShg
## 6 1.5Fin 5 5 1993 1995 Gable CompShg
## Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation
## 1 VinylSd VinylSd BrkFace 196 Gd TA PConc
## 2 MetalSd MetalSd None 0 TA TA CBlock
## 3 VinylSd VinylSd BrkFace 162 Gd TA PConc
## 4 Wd Sdng Wd Shng None 0 TA TA BrkTil
## 5 VinylSd VinylSd BrkFace 350 Gd TA PConc
## 6 VinylSd VinylSd None 0 TA TA Wood
## BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## 1 Gd TA No GLQ 706 Unf
## 2 Gd TA Gd ALQ 978 Unf
## 3 Gd TA Mn GLQ 486 Unf
## 4 TA Gd No ALQ 216 Unf
## 5 Gd TA Av GLQ 655 Unf
## 6 Gd TA No GLQ 732 Unf
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical
## 1 0 150 856 GasA Ex Y SBrkr
## 2 0 284 1262 GasA Ex Y SBrkr
## 3 0 434 920 GasA Ex Y SBrkr
## 4 0 540 756 GasA Gd Y SBrkr
## 5 0 490 1145 GasA Ex Y SBrkr
## 6 0 64 796 GasA Ex Y SBrkr
## X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## 1 856 854 0 1710 1 0 2
## 2 1262 0 0 1262 0 1 2
## 3 920 866 0 1786 1 0 2
## 4 961 756 0 1717 1 0 1
## 5 1145 1053 0 2198 1 0 2
## 6 796 566 0 1362 1 0 1
## HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## 1 1 3 1 Gd 8 Typ
## 2 0 3 1 TA 6 Typ
## 3 1 3 1 Gd 6 Typ
## 4 0 3 1 Gd 7 Typ
## 5 1 4 1 Gd 9 Typ
## 6 1 1 1 TA 5 Typ
## Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars
## 1 0 <NA> Attchd 2003 RFn 2
## 2 1 TA Attchd 1976 RFn 2
## 3 1 TA Attchd 2001 RFn 2
## 4 1 Gd Detchd 1998 Unf 3
## 5 1 TA Attchd 2000 RFn 3
## 6 0 <NA> Attchd 1993 Unf 2
## GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF
## 1 548 TA TA Y 0 61
## 2 460 TA TA Y 298 0
## 3 608 TA TA Y 0 42
## 4 642 TA TA Y 0 35
## 5 836 TA TA Y 192 84
## 6 480 TA TA Y 40 30
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature
## 1 0 0 0 0 <NA> <NA> <NA>
## 2 0 0 0 0 <NA> <NA> <NA>
## 3 0 0 0 0 <NA> <NA> <NA>
## 4 272 0 0 0 <NA> <NA> <NA>
## 5 0 0 0 0 <NA> <NA> <NA>
## 6 0 320 0 0 <NA> MnPrv Shed
## MiscVal MoSold YrSold SaleType SaleCondition SalePrice train
## 1 0 2 2008 WD Normal 208500 1
## 2 0 5 2007 WD Normal 181500 1
## 3 0 9 2008 WD Normal 223500 1
## 4 0 2 2006 WD Abnorml 140000 1
## 5 0 12 2008 WD Normal 250000 1
## 6 700 10 2009 WD Normal 143000 1
str(ames)
## 'data.frame': 2919 obs. of 82 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ MSSubClass : int 60 20 60 70 60 50 20 60 50 190 ...
## $ MSZoning : chr "RL" "RL" "RL" "RL" ...
## $ LotFrontage : int 65 80 68 60 84 85 75 NA 51 50 ...
## $ LotArea : int 8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
## $ Street : chr "Pave" "Pave" "Pave" "Pave" ...
## $ Alley : chr NA NA NA NA ...
## $ LotShape : chr "Reg" "Reg" "IR1" "IR1" ...
## $ LandContour : chr "Lvl" "Lvl" "Lvl" "Lvl" ...
## $ Utilities : chr "AllPub" "AllPub" "AllPub" "AllPub" ...
## $ LotConfig : chr "Inside" "FR2" "Inside" "Corner" ...
## $ LandSlope : chr "Gtl" "Gtl" "Gtl" "Gtl" ...
## $ Neighborhood : chr "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
## $ Condition1 : chr "Norm" "Feedr" "Norm" "Norm" ...
## $ Condition2 : chr "Norm" "Norm" "Norm" "Norm" ...
## $ BldgType : chr "1Fam" "1Fam" "1Fam" "1Fam" ...
## $ HouseStyle : chr "2Story" "1Story" "2Story" "2Story" ...
## $ OverallQual : int 7 6 7 7 8 5 8 7 7 5 ...
## $ OverallCond : int 5 8 5 5 5 5 5 6 5 6 ...
## $ YearBuilt : int 2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
## $ YearRemodAdd : int 2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
## $ RoofStyle : chr "Gable" "Gable" "Gable" "Gable" ...
## $ RoofMatl : chr "CompShg" "CompShg" "CompShg" "CompShg" ...
## $ Exterior1st : chr "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
## $ Exterior2nd : chr "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
## $ MasVnrType : chr "BrkFace" "None" "BrkFace" "None" ...
## $ MasVnrArea : int 196 0 162 0 350 0 186 240 0 0 ...
## $ ExterQual : chr "Gd" "TA" "Gd" "TA" ...
## $ ExterCond : chr "TA" "TA" "TA" "TA" ...
## $ Foundation : chr "PConc" "CBlock" "PConc" "BrkTil" ...
## $ BsmtQual : chr "Gd" "Gd" "Gd" "TA" ...
## $ BsmtCond : chr "TA" "TA" "TA" "Gd" ...
## $ BsmtExposure : chr "No" "Gd" "Mn" "No" ...
## $ BsmtFinType1 : chr "GLQ" "ALQ" "GLQ" "ALQ" ...
## $ BsmtFinSF1 : int 706 978 486 216 655 732 1369 859 0 851 ...
## $ BsmtFinType2 : chr "Unf" "Unf" "Unf" "Unf" ...
## $ BsmtFinSF2 : int 0 0 0 0 0 0 0 32 0 0 ...
## $ BsmtUnfSF : int 150 284 434 540 490 64 317 216 952 140 ...
## $ TotalBsmtSF : int 856 1262 920 756 1145 796 1686 1107 952 991 ...
## $ Heating : chr "GasA" "GasA" "GasA" "GasA" ...
## $ HeatingQC : chr "Ex" "Ex" "Ex" "Gd" ...
## $ CentralAir : chr "Y" "Y" "Y" "Y" ...
## $ Electrical : chr "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
## $ X1stFlrSF : int 856 1262 920 961 1145 796 1694 1107 1022 1077 ...
## $ X2ndFlrSF : int 854 0 866 756 1053 566 0 983 752 0 ...
## $ LowQualFinSF : int 0 0 0 0 0 0 0 0 0 0 ...
## $ GrLivArea : int 1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
## $ BsmtFullBath : int 1 0 1 1 1 1 1 1 0 1 ...
## $ BsmtHalfBath : int 0 1 0 0 0 0 0 0 0 0 ...
## $ FullBath : int 2 2 2 1 2 1 2 2 2 1 ...
## $ HalfBath : int 1 0 1 0 1 1 0 1 0 0 ...
## $ BedroomAbvGr : int 3 3 3 3 4 1 3 3 2 2 ...
## $ KitchenAbvGr : int 1 1 1 1 1 1 1 1 2 2 ...
## $ KitchenQual : chr "Gd" "TA" "Gd" "Gd" ...
## $ TotRmsAbvGrd : int 8 6 6 7 9 5 7 7 8 5 ...
## $ Functional : chr "Typ" "Typ" "Typ" "Typ" ...
## $ Fireplaces : int 0 1 1 1 1 0 1 2 2 2 ...
## $ FireplaceQu : chr NA "TA" "TA" "Gd" ...
## $ GarageType : chr "Attchd" "Attchd" "Attchd" "Detchd" ...
## $ GarageYrBlt : int 2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
## $ GarageFinish : chr "RFn" "RFn" "RFn" "Unf" ...
## $ GarageCars : int 2 2 2 3 3 2 2 2 2 1 ...
## $ GarageArea : int 548 460 608 642 836 480 636 484 468 205 ...
## $ GarageQual : chr "TA" "TA" "TA" "TA" ...
## $ GarageCond : chr "TA" "TA" "TA" "TA" ...
## $ PavedDrive : chr "Y" "Y" "Y" "Y" ...
## $ WoodDeckSF : int 0 298 0 0 192 40 255 235 90 0 ...
## $ OpenPorchSF : int 61 0 42 35 84 30 57 204 0 4 ...
## $ EnclosedPorch: int 0 0 0 272 0 0 0 228 205 0 ...
## $ X3SsnPorch : int 0 0 0 0 0 320 0 0 0 0 ...
## $ ScreenPorch : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolArea : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolQC : chr NA NA NA NA ...
## $ Fence : chr NA NA NA NA ...
## $ MiscFeature : chr NA NA NA NA ...
## $ MiscVal : int 0 0 0 0 0 700 0 350 0 0 ...
## $ MoSold : int 2 5 9 2 12 10 8 11 4 1 ...
## $ YrSold : int 2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
## $ SaleType : chr "WD" "WD" "WD" "WD" ...
## $ SaleCondition: chr "Normal" "Normal" "Normal" "Abnorml" ...
## $ SalePrice : int 208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
## $ train : num 1 1 1 1 1 1 1 1 1 1 ...
summary(ames)
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 Min. : 20.00 Length:2919 Min. : 21.00
## 1st Qu.: 730.5 1st Qu.: 20.00 Class :character 1st Qu.: 59.00
## Median :1460.0 Median : 50.00 Mode :character Median : 68.00
## Mean :1460.0 Mean : 57.14 Mean : 69.31
## 3rd Qu.:2189.5 3rd Qu.: 70.00 3rd Qu.: 80.00
## Max. :2919.0 Max. :190.00 Max. :313.00
## NA's :486
## LotArea Street Alley LotShape
## Min. : 1300 Length:2919 Length:2919 Length:2919
## 1st Qu.: 7478 Class :character Class :character Class :character
## Median : 9453 Mode :character Mode :character Mode :character
## Mean : 10168
## 3rd Qu.: 11570
## Max. :215245
##
## LandContour Utilities LotConfig LandSlope
## Length:2919 Length:2919 Length:2919 Length:2919
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Neighborhood Condition1 Condition2 BldgType
## Length:2919 Length:2919 Length:2919 Length:2919
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## HouseStyle OverallQual OverallCond YearBuilt
## Length:2919 Min. : 1.000 Min. :1.000 Min. :1872
## Class :character 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954
## Mode :character Median : 6.000 Median :5.000 Median :1973
## Mean : 6.089 Mean :5.565 Mean :1971
## 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2001
## Max. :10.000 Max. :9.000 Max. :2010
##
## YearRemodAdd RoofStyle RoofMatl Exterior1st
## Min. :1950 Length:2919 Length:2919 Length:2919
## 1st Qu.:1965 Class :character Class :character Class :character
## Median :1993 Mode :character Mode :character Mode :character
## Mean :1984
## 3rd Qu.:2004
## Max. :2010
##
## Exterior2nd MasVnrType MasVnrArea ExterQual
## Length:2919 Length:2919 Min. : 0.0 Length:2919
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 0.0 Mode :character
## Mean : 102.2
## 3rd Qu.: 164.0
## Max. :1600.0
## NA's :23
## ExterCond Foundation BsmtQual BsmtCond
## Length:2919 Length:2919 Length:2919 Length:2919
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## Length:2919 Length:2919 Min. : 0.0 Length:2919
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 368.5 Mode :character
## Mean : 441.4
## 3rd Qu.: 733.0
## Max. :5644.0
## NA's :1
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Length:2919
## 1st Qu.: 0.00 1st Qu.: 220.0 1st Qu.: 793.0 Class :character
## Median : 0.00 Median : 467.0 Median : 989.5 Mode :character
## Mean : 49.58 Mean : 560.8 Mean :1051.8
## 3rd Qu.: 0.00 3rd Qu.: 805.5 3rd Qu.:1302.0
## Max. :1526.00 Max. :2336.0 Max. :6110.0
## NA's :1 NA's :1 NA's :1
## HeatingQC CentralAir Electrical X1stFlrSF
## Length:2919 Length:2919 Length:2919 Min. : 334
## Class :character Class :character Class :character 1st Qu.: 876
## Mode :character Mode :character Mode :character Median :1082
## Mean :1160
## 3rd Qu.:1388
## Max. :5095
##
## X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## Min. : 0.0 Min. : 0.000 Min. : 334 Min. :0.0000
## 1st Qu.: 0.0 1st Qu.: 0.000 1st Qu.:1126 1st Qu.:0.0000
## Median : 0.0 Median : 0.000 Median :1444 Median :0.0000
## Mean : 336.5 Mean : 4.694 Mean :1501 Mean :0.4299
## 3rd Qu.: 704.0 3rd Qu.: 0.000 3rd Qu.:1744 3rd Qu.:1.0000
## Max. :2065.0 Max. :1064.000 Max. :5642 Max. :3.0000
## NA's :2
## BsmtHalfBath FullBath HalfBath BedroomAbvGr
## Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.00
## 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.00
## Median :0.00000 Median :2.000 Median :0.0000 Median :3.00
## Mean :0.06136 Mean :1.568 Mean :0.3803 Mean :2.86
## 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.00
## Max. :2.00000 Max. :4.000 Max. :2.0000 Max. :8.00
## NA's :2
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## Min. :0.000 Length:2919 Min. : 2.000 Length:2919
## 1st Qu.:1.000 Class :character 1st Qu.: 5.000 Class :character
## Median :1.000 Mode :character Median : 6.000 Mode :character
## Mean :1.045 Mean : 6.452
## 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :3.000 Max. :15.000
##
## Fireplaces FireplaceQu GarageType GarageYrBlt
## Min. :0.0000 Length:2919 Length:2919 Min. :1895
## 1st Qu.:0.0000 Class :character Class :character 1st Qu.:1960
## Median :1.0000 Mode :character Mode :character Median :1979
## Mean :0.5971 Mean :1978
## 3rd Qu.:1.0000 3rd Qu.:2002
## Max. :4.0000 Max. :2207
## NA's :159
## GarageFinish GarageCars GarageArea GarageQual
## Length:2919 Min. :0.000 Min. : 0.0 Length:2919
## Class :character 1st Qu.:1.000 1st Qu.: 320.0 Class :character
## Mode :character Median :2.000 Median : 480.0 Mode :character
## Mean :1.767 Mean : 472.9
## 3rd Qu.:2.000 3rd Qu.: 576.0
## Max. :5.000 Max. :1488.0
## NA's :1 NA's :1
## GarageCond PavedDrive WoodDeckSF OpenPorchSF
## Length:2919 Length:2919 Min. : 0.00 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.00 1st Qu.: 0.00
## Mode :character Mode :character Median : 0.00 Median : 26.00
## Mean : 93.71 Mean : 47.49
## 3rd Qu.: 168.00 3rd Qu.: 70.00
## Max. :1424.00 Max. :742.00
##
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea
## Min. : 0.0 Min. : 0.000 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.0 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.0 Median : 0.000 Median : 0.00 Median : 0.000
## Mean : 23.1 Mean : 2.602 Mean : 16.06 Mean : 2.252
## 3rd Qu.: 0.0 3rd Qu.: 0.000 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :1012.0 Max. :508.000 Max. :576.00 Max. :800.000
##
## PoolQC Fence MiscFeature MiscVal
## Length:2919 Length:2919 Length:2919 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 0.00
## Mode :character Mode :character Mode :character Median : 0.00
## Mean : 50.83
## 3rd Qu.: 0.00
## Max. :17000.00
##
## MoSold YrSold SaleType SaleCondition
## Min. : 1.000 Min. :2006 Length:2919 Length:2919
## 1st Qu.: 4.000 1st Qu.:2007 Class :character Class :character
## Median : 6.000 Median :2008 Mode :character Mode :character
## Mean : 6.213 Mean :2008
## 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :12.000 Max. :2010
##
## SalePrice train
## Min. : 34900 Min. :0.0000
## 1st Qu.:129975 1st Qu.:0.0000
## Median :163000 Median :1.0000
## Mean :180921 Mean :0.5002
## 3rd Qu.:214000 3rd Qu.:1.0000
## Max. :755000 Max. :1.0000
## NA's :1459
For data cleaning purposes, we will merge test and train into one dataset, keeping in mind that the 1459 NA’s in the SalePrice column are from the test set. We will also add a column to indicate whether the row is from the train or test set.
Next, we will start exploring the data for insights into the Ames housing market as well as what variables we need to change, clean, or create.
First, we will visualize where the NA values are and whether it will affect the first to parameters we are concerned about: Sale Price and Gross Living Area.
# Summarize NA's by column
ames %>%
summarise_all(funs(sum(is.na(.)))) %>%
gather(key = "Column", value = "NA_Count", -1) %>%
filter(NA_Count > 0) %>%
ggplot(aes(x = reorder(Column, NA_Count), y = NA_Count)) +
geom_col() +
coord_flip() +
theme_gdocs() +
labs(title = "Number of NA's by Column", x = "Column", y = "NA Count")
# Create a table of the missing NAs by column
ames %>%
summarise_all(funs(sum(is.na(.)))) %>%
gather(key = "Column", value = "NA_Count", -1) %>%
filter(NA_Count > 0) %>%
arrange(desc(NA_Count)) %>%
kable()
| Id | Column | NA_Count |
|---|---|---|
| 0 | PoolQC | 2909 |
| 0 | MiscFeature | 2814 |
| 0 | Alley | 2721 |
| 0 | Fence | 2348 |
| 0 | SalePrice | 1459 |
| 0 | FireplaceQu | 1420 |
| 0 | LotFrontage | 486 |
| 0 | GarageYrBlt | 159 |
| 0 | GarageFinish | 159 |
| 0 | GarageQual | 159 |
| 0 | GarageCond | 159 |
| 0 | GarageType | 157 |
| 0 | BsmtCond | 82 |
| 0 | BsmtExposure | 82 |
| 0 | BsmtQual | 81 |
| 0 | BsmtFinType2 | 80 |
| 0 | BsmtFinType1 | 79 |
| 0 | MasVnrType | 24 |
| 0 | MasVnrArea | 23 |
| 0 | MSZoning | 4 |
| 0 | Utilities | 2 |
| 0 | BsmtFullBath | 2 |
| 0 | BsmtHalfBath | 2 |
| 0 | Functional | 2 |
| 0 | Exterior1st | 1 |
| 0 | Exterior2nd | 1 |
| 0 | BsmtFinSF1 | 1 |
| 0 | BsmtFinSF2 | 1 |
| 0 | BsmtUnfSF | 1 |
| 0 | TotalBsmtSF | 1 |
| 0 | Electrical | 1 |
| 0 | KitchenQual | 1 |
| 0 | GarageCars | 1 |
| 0 | GarageArea | 1 |
| 0 | SaleType | 1 |
There are not too many NA’s in the data set, and they appear mostly to do with lack of a certain feature. For example, if a house does not have a pool, then the PoolQC column will be NA. We will need to decide how to handle these NA’s, but for now we will leave them as is and continue with the analysis of Sale Price and Gross Living Area.
Restate the problem here
# Plot Sale Price vs. Gross Living Area colored by neighborhood, omitting rows where SalePrice is NA
ames %>%
filter(!is.na(SalePrice)) %>%
ggplot(aes(x = GrLivArea, y = SalePrice, color = Neighborhood)) +
geom_point() +
theme_gdocs() +
labs(title = "Sale Price vs. Gross Living Area by Neighborhood", x = "Gross Living Area", y = "Sale Price")
There is clearly a relationship between Sale Price and Gross Living
Area, and the neighborhoods appear to have a similar relationship. We
will now look at log transformations of the data to see if there is more
linear relationship.
# Plot log(Sale Price) vs. log(Gross Living Area) colored by neighborhood, omitting rows where SalePrice is NA
ames %>%
filter(!is.na(SalePrice)) %>%
ggplot(aes(x = log(GrLivArea), y = log(SalePrice), color = Neighborhood)) +
geom_point() +
theme_gdocs() +
labs(
title = "log(Sale Price) vs. log(Gross Living Area) by Neighborhood",
x = "log(Gross Living Area)",
y = "log(Sale Price)"
)
This relationship appears to be more linear. We will create columns for
the log of Sale Price and Gross Living Area and use these in our
analysis.
# Create columns for log(SalePrice) and log(GrLivArea)
ames$logSalePrice <- log(ames$SalePrice)
ames$logGrLivArea <- log(ames$GrLivArea)
PRESS <- function(linear.model) {
#' calculate the predictive residuals
pr <- residuals(linear.model) / (1 - lm.influence(linear.model)$hat)
#' calculate the PRESS
PRESS <- sum(pr^2)
return(PRESS)
}
# Function for calculating PRESS
# Tom Hopper
# https://gist.github.com/tomhopper/8c204d978c4a0cbcb8c0
^Just some code for CV PRESS and log transformations. Jae: What do you think about using AIC rather than PRESS? The models are optimized using AIC.
Next, we will visualize the relationship between log Sale Price and log Gross Living Area for the neighborhoods that Century21 operates in: NAmes, Edwards and BrkSide.
# Plot log(Sale Price) vs. log(Gross Living Area) colored by neighborhood, omitting rows where SalePrice is NA for only the neighborhoods of interest
century21 <-
ames %>%
filter(!is.na(SalePrice)) %>%
filter(Neighborhood %in% c("NAmes", "Edwards", "BrkSide"))
century21 %>%
ggplot(aes(x = logGrLivArea, y = logSalePrice, color = Neighborhood)) +
geom_point() +
theme_gdocs() +
labs(
title = "log(Sale Price) vs. log(Gross Living Area) by Neighborhood",
x = "log(Gross Living Area)",
y = "log(Sale Price)"
)
The relationship appears to be linear, so we will fit a linear model using this data and asses whether it describes the Sale Prices accurately.
# Fit a linear model to the data
fit1x <- lm(logSalePrice ~ logGrLivArea + Neighborhood, data = century21)
summary(fit1x)
##
## Call:
## lm(formula = logSalePrice ~ logGrLivArea + Neighborhood, data = century21)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.72154 -0.10592 0.02469 0.11565 0.79364
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.76936 0.22919 33.900 < 2e-16 ***
## logGrLivArea 0.55579 0.03237 17.171 < 2e-16 ***
## NeighborhoodEdwards -0.02044 0.03252 -0.629 0.53
## NeighborhoodNAmes 0.13279 0.02906 4.569 6.63e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1961 on 379 degrees of freedom
## Multiple R-squared: 0.4897, Adjusted R-squared: 0.4857
## F-statistic: 121.2 on 3 and 379 DF, p-value: < 2.2e-16
PRESS(fit1x)
## [1] 15.00131
# Fit a linear model to the data with interaction variables
fit1 <- lm(logSalePrice ~ logGrLivArea * Neighborhood, data = century21)
summary(fit1)
##
## Call:
## lm(formula = logSalePrice ~ logGrLivArea * Neighborhood, data = century21)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.72080 -0.10353 0.02184 0.10586 0.80470
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.91292 0.50459 11.718 < 2e-16 ***
## logGrLivArea 0.81965 0.07163 11.443 < 2e-16 ***
## NeighborhoodEdwards 2.09359 0.64589 3.241 0.0013 **
## NeighborhoodNAmes 2.57981 0.59988 4.301 2.17e-05 ***
## logGrLivArea:NeighborhoodEdwards -0.29998 0.09122 -3.289 0.0011 **
## logGrLivArea:NeighborhoodNAmes -0.34662 0.08482 -4.087 5.35e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1923 on 377 degrees of freedom
## Multiple R-squared: 0.5121, Adjusted R-squared: 0.5056
## F-statistic: 79.14 on 5 and 377 DF, p-value: < 2.2e-16
PRESS(fit1)
## [1] 14.60908
confint(fit1) %>% kable()
| 2.5 % | 97.5 % | |
|---|---|---|
| (Intercept) | 4.9207572 | 6.9050843 |
| logGrLivArea | 0.6788064 | 0.9604897 |
| NeighborhoodEdwards | 0.8235795 | 3.3635933 |
| NeighborhoodNAmes | 1.4002744 | 3.7593394 |
| logGrLivArea:NeighborhoodEdwards | -0.4793353 | -0.1206263 |
| logGrLivArea:NeighborhoodNAmes | -0.5134042 | -0.1798447 |
# Plot the data with the linear model superposed
century21 %>%
ggplot(aes(x = logGrLivArea, y = logSalePrice, color = Neighborhood)) +
geom_point() +
theme_gdocs() +
labs(
title = "log(Sale Price) vs. log(Gross Living Area) by Neighborhood",
x = "log(Gross Living Area)",
y = "log(Sale Price)"
) +
geom_smooth(
method = "lm", formula = y ~ x, se = FALSE, linewidth = 1,
data = data.frame(
logGrLivArea = century21$logGrLivArea,
Neighborhood = century21$Neighborhood,
logSalePrice = predict(fit1)
)
)
# # Print parameter estimate table nicely. Not working, needs debugging
# fit1 %>%
# summary() %>%
# {cbind(as.data.frame(coef(.)), .[["coefficients"]][, 2:4])} %>%
# setNames(c("Estimate", "Std. Error", "t-value", "Pr(>|t|)")) %>%
# rownames_to_column(var = "Term") %>%
# mutate(Term = ifelse(Term == "(Intercept)", "Intercept", Term)) %>%
# add_row(Term = "Adjusted R-squared", Estimate = round(.$adj.r.squared, 3), Std..Error = NA, `t-value` = NA, `Pr(>|t|)` = NA) %>%
# kable(digits = 3, align = "c") %>%
# kable_styling(full_width = FALSE)
We fit models with and without interaction terms. The interaction terms are statistically significant (include p values) and the model with interaction terms has a lower PRESS. This means that the relationship between log sale price and log gross living area is different for each neighborhood.
It might be good to test whether Edwards and NAmes are are statistically different. We could do that by re-referencing or doing the BYOA method.
# Plot the studentized residuals using base R
plot(fit1$fitted.values, fit1$residuals, type = "p")
plot(fit1$residuals)
# Decide which of these we like better ggplot or R
# Calculate studentized residuals
stud_res <- rstudent(fit1)
# Create a data frame with the studentized residuals
df <- data.frame(stud_res, logGrLivArea = model.frame(fit1)$logGrLivArea)
# Create a scatterplot of the studentized residuals
ggplot(df, aes(x = logGrLivArea, y = stud_res)) +
geom_point() +
labs(title = "Scatterplot of Studentized Residuals",
x = "Studentized Residuals",
y = "Frequency") +
theme_minimal()
# Create histogram with normal curve
ggplot(df, aes(x = stud_res)) +
geom_histogram(aes(y = ..density..), binwidth = 0.5, fill = "lightblue", color = "black") +
stat_function(fun = dnorm, args = list(mean = mean(df$stud_res), sd = sd(df$stud_res)), color = "blue", size = 1.2) +
labs(title = "Histogram of Studentized Residuals with Normal Curve",
x = "Studentized Residuals",
y = "Frequency") +
theme_minimal()
The plots show the same thing, please include whichever style you think
looks better. There is no evidence of non-linearity, heteroscedasticity,
or non-normality.
# Plot the residuals vs. the fitted values
fit1 %>%
plot()
There are no influential points that need to be investigated. The
results are consistent with the assumptions of the linear model.
This is already done, just need to move it here. Adj R2, CV press
Estimates, interpretation, confidence
A short summary of the analysis
Either embed the app here or link to it
Restate the problem
In order to use a linear regression model, we need to convert all of the categorical variables into dummy variables. We will also remove or impute the NA’s in the data set. Please see the appendix for details on this process.
# Data Cleaning
# If pool-related variables are NA, assume there is no pool and assign to 0
ames <- ames %>%
mutate(
PoolQC = ifelse(is.na(PoolQC), "None", PoolQC),
PoolArea = ifelse(is.na(PoolArea), 0, PoolArea),
)
# If garage-related variables are NA, assume there is no garage and assign to 0
ames <- ames %>%
mutate(
GarageType = ifelse(is.na(GarageType), "None", GarageType),
#GarageYrBlt = ifelse(is.na(GarageYrBlt), 0, GarageYrBlt), #These will be changed to the mean because of large year values
GarageFinish = ifelse(is.na(GarageFinish), "None", GarageFinish),
GarageCars = ifelse(is.na(GarageCars), 0, GarageCars),
GarageArea = ifelse(is.na(GarageArea), 0, GarageArea),
GarageQual = ifelse(is.na(GarageQual), "None", GarageQual),
GarageCond = ifelse(is.na(GarageCond), "None", GarageCond)
)
# If Bsmt-related variables are NA, assume there is no Bsmt and assign to 0
ames <- ames %>%
mutate(
BsmtQual = ifelse(is.na(BsmtQual), "None", BsmtQual),
BsmtCond = ifelse(is.na(BsmtCond), "None", BsmtCond),
BsmtExposure = ifelse(is.na(BsmtExposure), "None", BsmtExposure),
BsmtFinType1 = ifelse(is.na(BsmtFinType1), "None", BsmtFinType1),
BsmtFinSF1 = ifelse(is.na(BsmtFinSF1), 0, BsmtFinSF1),
BsmtFinType2 = ifelse(is.na(BsmtFinType2), "None", BsmtFinType2),
BsmtFinSF2 = ifelse(is.na(BsmtFinSF2), 0, BsmtFinSF2),
BsmtUnfSF = ifelse(is.na(BsmtUnfSF), 0, BsmtUnfSF),
TotalBsmtSF = ifelse(is.na(TotalBsmtSF), 0, TotalBsmtSF)
)
# If Fence-related variables are NA, assume there is no Fence and assign to 0
ames <- ames %>%
mutate(
Fence = ifelse(is.na(Fence), "None", Fence),
)
# If Misc-related variables are NA, assume there is no Misc features and assign to 0
ames <- ames %>%
mutate(
MiscFeature = ifelse(is.na(MiscFeature), "None", MiscFeature),
)
# If Fireplace-related variables are NA, assume there is no Fireplace and assign to 0
ames <- ames %>%
mutate(
FireplaceQu = ifelse(is.na(FireplaceQu), "None", FireplaceQu),
)
# If Alley-related variables are NA, assume there is no Alley and assign to 0
ames <- ames %>%
mutate(
Alley = ifelse(is.na(Alley), "None", Alley),
)
# Summarize the amount of remaining NA's by column to check what's left
colSums(is.na(ames))
## Id MSSubClass MSZoning LotFrontage LotArea
## 0 0 4 486 0
## Street Alley LotShape LandContour Utilities
## 0 0 0 0 2
## LotConfig LandSlope Neighborhood Condition1 Condition2
## 0 0 0 0 0
## BldgType HouseStyle OverallQual OverallCond YearBuilt
## 0 0 0 0 0
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd
## 0 0 0 1 1
## MasVnrType MasVnrArea ExterQual ExterCond Foundation
## 24 23 0 0 0
## BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1
## 0 0 0 0 0
## BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## 0 0 0 0 0
## HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF
## 0 0 1 0 0
## LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## 0 0 2 2 0
## HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd
## 0 0 0 1 0
## Functional Fireplaces FireplaceQu GarageType GarageYrBlt
## 2 0 0 0 159
## GarageFinish GarageCars GarageArea GarageQual GarageCond
## 0 0 0 0 0
## PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 0 0 0 0 0
## ScreenPorch PoolArea PoolQC Fence MiscFeature
## 0 0 0 0 0
## MiscVal MoSold YrSold SaleType SaleCondition
## 0 0 0 1 0
## SalePrice train logSalePrice logGrLivArea
## 1459 0 1459 0
# Use the dummyVars() function to convert categorical variables into dummy variables
# Then use janitor::clean_names() to clean up the column names
dummy_model <- dummyVars(~ ., data = ames)
ames_dummy <- as.data.frame(predict(dummy_model, newdata = ames))
ames_dummy <- clean_names(ames_dummy)
# Fill in all remaining na values with the mean of the column
ames_dummy <- ames_dummy %>%
mutate(across(
c(-sale_price, -log_sale_price),
~ ifelse(is.na(.), mean(., na.rm = TRUE), .)
))
# Summarize the amount of remaining NA's by column
colSums(is.na(ames_dummy))
## id ms_sub_class ms_zoning_c_all
## 0 0 0
## ms_zoning_fv ms_zoning_rh ms_zoning_rl
## 0 0 0
## ms_zoning_rm lot_frontage lot_area
## 0 0 0
## street_grvl street_pave alley_grvl
## 0 0 0
## alley_none alley_pave lot_shape_ir1
## 0 0 0
## lot_shape_ir2 lot_shape_ir3 lot_shape_reg
## 0 0 0
## land_contour_bnk land_contour_hls land_contour_low
## 0 0 0
## land_contour_lvl utilities_all_pub utilities_no_se_wa
## 0 0 0
## lot_config_corner lot_config_cul_d_sac lot_config_fr2
## 0 0 0
## lot_config_fr3 lot_config_inside land_slope_gtl
## 0 0 0
## land_slope_mod land_slope_sev neighborhood_blmngtn
## 0 0 0
## neighborhood_blueste neighborhood_br_dale neighborhood_brk_side
## 0 0 0
## neighborhood_clear_cr neighborhood_collg_cr neighborhood_crawfor
## 0 0 0
## neighborhood_edwards neighborhood_gilbert neighborhood_idotrr
## 0 0 0
## neighborhood_meadow_v neighborhood_mitchel neighborhood_n_ames
## 0 0 0
## neighborhood_no_ridge neighborhood_n_pk_vill neighborhood_nridg_ht
## 0 0 0
## neighborhood_nw_ames neighborhood_old_town neighborhood_sawyer
## 0 0 0
## neighborhood_sawyer_w neighborhood_somerst neighborhood_stone_br
## 0 0 0
## neighborhood_swisu neighborhood_timber neighborhood_veenker
## 0 0 0
## condition1artery condition1feedr condition1norm
## 0 0 0
## condition1pos_a condition1pos_n condition1rr_ae
## 0 0 0
## condition1rr_an condition1rr_ne condition1rr_nn
## 0 0 0
## condition2artery condition2feedr condition2norm
## 0 0 0
## condition2pos_a condition2pos_n condition2rr_ae
## 0 0 0
## condition2rr_an condition2rr_nn bldg_type1fam
## 0 0 0
## bldg_type2fm_con bldg_type_duplex bldg_type_twnhs
## 0 0 0
## bldg_type_twnhs_e house_style1_5fin house_style1_5unf
## 0 0 0
## house_style1story house_style2_5fin house_style2_5unf
## 0 0 0
## house_style2story house_style_s_foyer house_style_s_lvl
## 0 0 0
## overall_qual overall_cond year_built
## 0 0 0
## year_remod_add roof_style_flat roof_style_gable
## 0 0 0
## roof_style_gambrel roof_style_hip roof_style_mansard
## 0 0 0
## roof_style_shed roof_matl_cly_tile roof_matl_comp_shg
## 0 0 0
## roof_matl_membran roof_matl_metal roof_matl_roll
## 0 0 0
## roof_matl_tar_grv roof_matl_wd_shake roof_matl_wd_shngl
## 0 0 0
## exterior1st_asb_shng exterior1st_asph_shn exterior1st_brk_comm
## 0 0 0
## exterior1st_brk_face exterior1st_c_block exterior1st_cemnt_bd
## 0 0 0
## exterior1st_hd_board exterior1st_im_stucc exterior1st_metal_sd
## 0 0 0
## exterior1st_plywood exterior1st_stone exterior1st_stucco
## 0 0 0
## exterior1st_vinyl_sd exterior1st_wd_sdng exterior1st_wd_shing
## 0 0 0
## exterior2nd_asb_shng exterior2nd_asph_shn exterior2nd_brk_cmn
## 0 0 0
## exterior2nd_brk_face exterior2nd_c_block exterior2nd_cment_bd
## 0 0 0
## exterior2nd_hd_board exterior2nd_im_stucc exterior2nd_metal_sd
## 0 0 0
## exterior2nd_other exterior2nd_plywood exterior2nd_stone
## 0 0 0
## exterior2nd_stucco exterior2nd_vinyl_sd exterior2nd_wd_sdng
## 0 0 0
## exterior2nd_wd_shng mas_vnr_type_brk_cmn mas_vnr_type_brk_face
## 0 0 0
## mas_vnr_type_none mas_vnr_type_stone mas_vnr_area
## 0 0 0
## exter_qual_ex exter_qual_fa exter_qual_gd
## 0 0 0
## exter_qual_ta exter_cond_ex exter_cond_fa
## 0 0 0
## exter_cond_gd exter_cond_po exter_cond_ta
## 0 0 0
## foundation_brk_til foundation_c_block foundation_p_conc
## 0 0 0
## foundation_slab foundation_stone foundation_wood
## 0 0 0
## bsmt_qual_ex bsmt_qual_fa bsmt_qual_gd
## 0 0 0
## bsmt_qual_none bsmt_qual_ta bsmt_cond_fa
## 0 0 0
## bsmt_cond_gd bsmt_cond_none bsmt_cond_po
## 0 0 0
## bsmt_cond_ta bsmt_exposure_av bsmt_exposure_gd
## 0 0 0
## bsmt_exposure_mn bsmt_exposure_no bsmt_exposure_none
## 0 0 0
## bsmt_fin_type1alq bsmt_fin_type1blq bsmt_fin_type1glq
## 0 0 0
## bsmt_fin_type1lw_q bsmt_fin_type1none bsmt_fin_type1rec
## 0 0 0
## bsmt_fin_type1unf bsmt_fin_sf1 bsmt_fin_type2alq
## 0 0 0
## bsmt_fin_type2blq bsmt_fin_type2glq bsmt_fin_type2lw_q
## 0 0 0
## bsmt_fin_type2none bsmt_fin_type2rec bsmt_fin_type2unf
## 0 0 0
## bsmt_fin_sf2 bsmt_unf_sf total_bsmt_sf
## 0 0 0
## heating_floor heating_gas_a heating_gas_w
## 0 0 0
## heating_grav heating_oth_w heating_wall
## 0 0 0
## heating_qc_ex heating_qc_fa heating_qc_gd
## 0 0 0
## heating_qc_po heating_qcta central_air_n
## 0 0 0
## central_air_y electrical_fuse_a electrical_fuse_f
## 0 0 0
## electrical_fuse_p electrical_mix electrical_s_brkr
## 0 0 0
## x1st_flr_sf x2nd_flr_sf low_qual_fin_sf
## 0 0 0
## gr_liv_area bsmt_full_bath bsmt_half_bath
## 0 0 0
## full_bath half_bath bedroom_abv_gr
## 0 0 0
## kitchen_abv_gr kitchen_qual_ex kitchen_qual_fa
## 0 0 0
## kitchen_qual_gd kitchen_qual_ta tot_rms_abv_grd
## 0 0 0
## functional_maj1 functional_maj2 functional_min1
## 0 0 0
## functional_min2 functional_mod functional_sev
## 0 0 0
## functional_typ fireplaces fireplace_qu_ex
## 0 0 0
## fireplace_qu_fa fireplace_qu_gd fireplace_qu_none
## 0 0 0
## fireplace_qu_po fireplace_qu_ta garage_type2types
## 0 0 0
## garage_type_attchd garage_type_basment garage_type_built_in
## 0 0 0
## garage_type_car_port garage_type_detchd garage_type_none
## 0 0 0
## garage_yr_blt garage_finish_fin garage_finish_none
## 0 0 0
## garage_finish_r_fn garage_finish_unf garage_cars
## 0 0 0
## garage_area garage_qual_ex garage_qual_fa
## 0 0 0
## garage_qual_gd garage_qual_none garage_qual_po
## 0 0 0
## garage_qual_ta garage_cond_ex garage_cond_fa
## 0 0 0
## garage_cond_gd garage_cond_none garage_cond_po
## 0 0 0
## garage_cond_ta paved_drive_n paved_drive_p
## 0 0 0
## paved_drive_y wood_deck_sf open_porch_sf
## 0 0 0
## enclosed_porch x3ssn_porch screen_porch
## 0 0 0
## pool_area pool_qc_ex pool_qc_fa
## 0 0 0
## pool_qc_gd pool_qc_none fence_gd_prv
## 0 0 0
## fence_gd_wo fence_mn_prv fence_mn_ww
## 0 0 0
## fence_none misc_feature_gar2 misc_feature_none
## 0 0 0
## misc_feature_othr misc_feature_shed misc_feature_ten_c
## 0 0 0
## misc_val mo_sold yr_sold
## 0 0 0
## sale_type_cod sale_type_con sale_type_con_ld
## 0 0 0
## sale_type_con_li sale_type_con_lw sale_type_cwd
## 0 0 0
## sale_type_new sale_type_oth sale_type_wd
## 0 0 0
## sale_condition_abnorml sale_condition_adj_land sale_condition_alloca
## 0 0 0
## sale_condition_family sale_condition_normal sale_condition_partial
## 0 0 0
## sale_price train log_sale_price
## 1459 0 1459
## log_gr_liv_area
## 0
# Split the data into training and testing sets
train <- ames_dummy[ames_dummy$train == 1, ]
test <- ames_dummy[ames_dummy$train == 0, ]
We can write as much or as little about this as we’d like. Basically, we tried to interpret what the NA’s actually were saying and then make a more informed guess as to what the were rather than going with the mean. In most cases, this created a new category of “none”. Remaining values were imputed to the mean, and the data was split into training and testing sets.
talk about feature selection methods
# Forward Selection
# # Testing olsrr method.
# library(olsrr)
# fit2x <- lm(log_sale_price ~ . - sale_price, data = train)
# fit2y <- ols_step_forward_p(fit2x, penter = 0.15)$model
# summary(fit2y)
# defaultSummary(data.frame(pred = predict(fit2y), obs = train$log_sale_price))
# PRESS(fit2y)
# Check if the model object exists, train if it doesn't
if (file.exists("Models/lm_forwards.rds")) {
# Load the model object from disk
fit2 <- readRDS("Models/lm_forwards.rds")
} else {
# Perform stepwise selection
# Set up a parallel backend with the number of cores you want to use
cores <- 8 # Change this to the number of cores you want to use
cl <- makePSOCKcluster(cores)
registerDoParallel(cl)
set.seed(137)
ctrl <- trainControl(
method = "boot",
number = 5,
allowParallel = TRUE
)
fit2 <- train(log_sale_price ~ . - sale_price,
data = train,
method = "glmStepAIC",
trControl = ctrl,
direction = "forward",
penter = 0.05 # Not Working.
)
# Stop the parallel backend
stopCluster(cl)
# Save the model object to disk
saveRDS(fit2, "Models/lm_forwards.rds")
}
summary(fit2$finalModel)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.71103 -0.04529 0.00088 0.05269 0.60423
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.575e+00 5.641e-01 4.565 5.45e-06 ***
## overall_qual 3.809e-02 3.940e-03 9.668 < 2e-16 ***
## log_gr_liv_area 3.724e-01 4.096e-02 9.091 < 2e-16 ***
## year_built 2.109e-03 2.175e-04 9.699 < 2e-16 ***
## overall_cond 4.115e-02 3.341e-03 12.317 < 2e-16 ***
## total_bsmt_sf 1.388e-04 1.218e-05 11.389 < 2e-16 ***
## roof_matl_cly_tile -2.536e+00 1.522e-01 -16.671 < 2e-16 ***
## bsmt_unf_sf -5.867e-05 1.210e-05 -4.848 1.39e-06 ***
## garage_cars 2.679e-02 8.629e-03 3.104 0.001946 **
## bldg_type1fam 2.699e-02 1.062e-02 2.541 0.011163 *
## condition2pos_n -7.879e-01 7.940e-02 -9.923 < 2e-16 ***
## neighborhood_crawfor 1.269e-01 1.617e-02 7.851 8.32e-15 ***
## kitchen_qual_ex 7.455e-02 1.341e-02 5.561 3.22e-08 ***
## ms_zoning_c_all -4.215e-01 3.485e-02 -12.096 < 2e-16 ***
## lot_area 2.766e-06 3.941e-07 7.019 3.52e-12 ***
## sale_condition_abnorml -2.802e-02 1.967e-02 -1.424 0.154564
## heating_qc_ex 2.760e-02 7.032e-03 3.925 9.12e-05 ***
## ms_zoning_rm -6.218e-02 9.787e-03 -6.353 2.87e-10 ***
## sale_type_new 8.657e-02 2.039e-02 4.247 2.31e-05 ***
## functional_typ 5.328e-02 1.270e-02 4.194 2.92e-05 ***
## condition1norm 4.382e-02 9.923e-03 4.416 1.08e-05 ***
## bsmt_exposure_gd 4.001e-02 1.154e-02 3.468 0.000541 ***
## fireplace_qu_none -5.418e-03 1.341e-02 -0.404 0.686325
## foundation_p_conc 2.307e-02 8.838e-03 2.610 0.009152 **
## garage_area 9.963e-05 2.966e-05 3.359 0.000803 ***
## neighborhood_stone_br 1.299e-01 2.225e-02 5.837 6.63e-09 ***
## exterior1st_brk_face 8.696e-02 2.069e-02 4.203 2.81e-05 ***
## gr_liv_area 3.839e-05 2.666e-05 1.440 0.150135
## neighborhood_nridg_ht 7.552e-02 1.546e-02 4.886 1.15e-06 ***
## screen_porch 2.251e-04 5.082e-05 4.430 1.02e-05 ***
## wood_deck_sf 9.460e-05 2.412e-05 3.921 9.24e-05 ***
## neighborhood_somerst 5.219e-02 1.319e-02 3.958 7.95e-05 ***
## year_remod_add 8.439e-04 2.041e-04 4.134 3.78e-05 ***
## neighborhood_brk_side 4.458e-02 1.492e-02 2.987 0.002868 **
## heating_grav -1.573e-01 4.092e-02 -3.845 0.000126 ***
## neighborhood_no_ridge 7.181e-02 1.903e-02 3.774 0.000168 ***
## bsmt_full_bath 2.620e-02 7.358e-03 3.561 0.000382 ***
## functional_maj2 -2.728e-01 4.969e-02 -5.490 4.80e-08 ***
## neighborhood_edwards -5.540e-02 1.159e-02 -4.779 1.96e-06 ***
## garage_cond_fa -5.952e-02 1.844e-02 -3.228 0.001276 **
## pool_qc_gd 2.621e-01 7.277e-02 3.602 0.000327 ***
## sale_type_con_ld 1.134e-01 3.499e-02 3.241 0.001219 **
## sale_condition_normal 3.493e-02 1.715e-02 2.036 0.041923 *
## functional_sev -2.584e-01 1.055e-01 -2.449 0.014455 *
## bldg_type_twnhs -5.236e-02 1.856e-02 -2.821 0.004857 **
## kitchen_abv_gr -5.459e-02 1.648e-02 -3.313 0.000947 ***
## bsmt_qual_ex 4.047e-02 1.318e-02 3.070 0.002186 **
## neighborhood_meadow_v -9.322e-02 2.765e-02 -3.371 0.000770 ***
## lot_config_cul_d_sac 4.125e-02 1.171e-02 3.522 0.000443 ***
## lot_frontage 4.000e-04 1.694e-04 2.361 0.018364 *
## roof_matl_wd_shngl 6.308e-02 4.722e-02 1.336 0.181812
## condition2pos_a 4.246e-01 1.038e-01 4.091 4.55e-05 ***
## foundation_stone 1.064e-01 4.250e-02 2.504 0.012408 *
## central_air_n -4.767e-02 1.387e-02 -3.437 0.000606 ***
## neighborhood_mitchel -3.672e-02 1.558e-02 -2.356 0.018590 *
## bsmt_fin_type2blq -3.193e-02 1.963e-02 -1.627 0.103975
## functional_mod -8.554e-02 3.012e-02 -2.840 0.004585 **
## heating_gas_w 7.884e-02 2.671e-02 2.952 0.003215 **
## exterior1st_brk_comm -1.953e-01 7.560e-02 -2.583 0.009903 **
## bsmt_exposure_no -1.338e-02 6.722e-03 -1.990 0.046788 *
## land_slope_sev -2.409e-01 4.418e-02 -5.452 5.90e-08 ***
## roof_matl_membran 3.748e-01 1.131e-01 3.315 0.000940 ***
## neighborhood_clear_cr 2.828e-02 2.227e-02 1.270 0.204361
## exterior1st_wd_sdng -2.363e-02 8.896e-03 -2.656 0.007992 **
## garage_qual_ex 4.202e-01 1.218e-01 3.449 0.000579 ***
## electrical_s_brkr -1.794e-02 1.107e-02 -1.621 0.105243
## condition1artery -5.343e-02 1.815e-02 -2.943 0.003300 **
## condition1rr_ae -7.550e-02 3.219e-02 -2.345 0.019163 *
## garage_type2types -1.093e-01 4.346e-02 -2.516 0.011999 *
## house_style1_5unf 5.291e-02 2.876e-02 1.840 0.065989 .
## exterior1st_hd_board -2.232e-02 8.405e-03 -2.656 0.008000 **
## enclosed_porch 9.642e-05 5.029e-05 1.917 0.055413 .
## roof_matl_metal 2.827e-01 1.103e-01 2.563 0.010474 *
## foundation_wood -1.260e-01 5.903e-02 -2.135 0.032920 *
## bsmt_fin_type2unf 1.703e-02 9.223e-03 1.847 0.064981 .
## fence_gd_wo -3.274e-02 1.437e-02 -2.279 0.022824 *
## exterior2nd_brk_face -5.595e-02 2.823e-02 -1.982 0.047685 *
## exter_cond_ta 2.110e-02 8.839e-03 2.388 0.017090 *
## bsmt_cond_po 1.607e-01 8.073e-02 1.991 0.046661 *
## garage_cond_ex -3.224e-01 1.414e-01 -2.280 0.022779 *
## bsmt_fin_type1unf -1.600e-02 9.163e-03 -1.747 0.080931 .
## house_style2_5fin -5.933e-02 4.102e-02 -1.447 0.148258
## lot_config_corner 1.226e-02 7.241e-03 1.693 0.090662 .
## condition1pos_n 4.469e-02 2.666e-02 1.677 0.093860 .
## sale_type_con 1.145e-01 7.209e-02 1.589 0.112342
## land_slope_gtl -2.275e-02 1.416e-02 -1.607 0.108371
## sale_type_cwd 8.159e-02 5.160e-02 1.581 0.114034
## mas_vnr_type_brk_cmn -4.944e-02 2.777e-02 -1.780 0.075248 .
## pool_qc_ex 1.254e-01 7.882e-02 1.591 0.111789
## mas_vnr_area 2.904e-05 1.868e-05 1.554 0.120350
## roof_style_shed 3.091e-01 1.096e-01 2.821 0.004864 **
## condition2rr_ae -3.422e-01 1.506e-01 -2.273 0.023191 *
## fireplaces 1.600e-02 1.059e-02 1.510 0.131179
## exterior2nd_plywood -1.478e-02 1.018e-02 -1.452 0.146825
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.00988336)
##
## Null deviance: 232.801 on 1459 degrees of freedom
## Residual deviance: 13.501 on 1366 degrees of freedom
## AIC: -2504.5
##
## Number of Fisher Scoring iterations: 2
defaultSummary(data.frame(pred = predict(fit2), obs = train$log_sale_price))
## RMSE Rsquared MAE
## 0.09616150 0.94200760 0.06774188
PRESS(fit2$finalModel) #Press not working with caret models
## [1] Inf
varImp(fit2$finalModel)%>%
filter(Overall > 4) %>%
arrange(desc(Overall))
## Overall
## roof_matl_cly_tile 16.670625
## overall_cond 12.316543
## ms_zoning_c_all 12.095573
## total_bsmt_sf 11.388951
## condition2pos_n 9.922932
## year_built 9.699077
## overall_qual 9.667944
## log_gr_liv_area 9.091300
## neighborhood_crawfor 7.850502
## lot_area 7.018695
## ms_zoning_rm 6.353041
## neighborhood_stone_br 5.836962
## kitchen_qual_ex 5.560925
## functional_maj2 5.489575
## land_slope_sev 5.452246
## neighborhood_nridg_ht 4.886112
## bsmt_unf_sf 4.847542
## neighborhood_edwards 4.778576
## screen_porch 4.430030
## condition1norm 4.416121
## sale_type_new 4.246877
## exterior1st_brk_face 4.202900
## functional_typ 4.193654
## year_remod_add 4.134330
## condition2pos_a 4.090827
# Output the predictions for the test set to a csv file
# fit2x <- glm(formula = formula(fit2), data = train)
forward_pred <- predict(fit2$finalModel, test)
forward_pred %>%
data.frame() %>%
rownames_to_column(var = "id") %>%
mutate(SalePrice = exp(forward_pred)) %>%
dplyr::select(id, SalePrice) %>%
write_csv("Predictions/forward_predictions.csv")
Forward: AIC -2504, RMSE: .096, R2: 0.942. Adjusted R2 is probably in there, but I’m not sure how to find it.
# Backwards Selection
# Check if the model object exists, train if it doesn't
if (file.exists("Models/lm_backwards.rds")) {
# Load the model object from disk
fit3 <- readRDS("Models/lm_backwards.rds")
} else {
# Perform stepwise selection
# Set up a parallel backend with the number of cores you want to use
cores <- 8 # Change this to the number of cores you want to use
cl <- makePSOCKcluster(cores)
registerDoParallel(cl)
set.seed(137)
fit3 <- train(log_sale_price ~ . - sale_price,
data = train,
method = "glmStepAIC",
trControl = trainControl(method = "cv", number = 5, allowParallel = TRUE),
direction = "backward",
penter = 0.05 # Not Working.
)
# Stop the parallel backend
stopCluster(cl)
# Save the model object to disk
saveRDS(fit3, "Models/lm_backwards.rds")
}
summary(fit3$finalModel)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.67457 -0.04458 0.00227 0.04964 0.56677
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.883e+00 6.580e-01 5.902 4.55e-09 ***
## ms_sub_class -4.289e-04 1.145e-04 -3.745 0.000188 ***
## ms_zoning_c_all -3.520e-01 3.882e-02 -9.066 < 2e-16 ***
## ms_zoning_fv 7.715e-02 1.916e-02 4.027 5.96e-05 ***
## ms_zoning_rl 3.115e-02 1.224e-02 2.545 0.011029 *
## lot_frontage 3.672e-04 1.754e-04 2.094 0.036471 *
## lot_area 2.977e-06 4.236e-07 7.028 3.33e-12 ***
## street_grvl -9.790e-02 4.579e-02 -2.138 0.032720 *
## lot_shape_ir2 2.714e-02 1.725e-02 1.573 0.116000
## land_contour_bnk -2.563e-02 1.453e-02 -1.763 0.078094 .
## land_contour_low -3.702e-02 2.221e-02 -1.667 0.095824 .
## utilities_all_pub 1.908e-01 1.028e-01 1.856 0.063689 .
## lot_config_corner 1.345e-02 7.317e-03 1.838 0.066279 .
## lot_config_cul_d_sac 3.677e-02 1.203e-02 3.057 0.002277 **
## lot_config_fr2 -2.216e-02 1.555e-02 -1.425 0.154307
## land_slope_gtl 2.234e-01 4.614e-02 4.842 1.44e-06 ***
## land_slope_mod 2.558e-01 4.641e-02 5.513 4.24e-08 ***
## neighborhood_br_dale -6.804e-02 3.309e-02 -2.056 0.039990 *
## neighborhood_collg_cr -2.287e-02 1.203e-02 -1.901 0.057538 .
## neighborhood_crawfor 9.867e-02 1.722e-02 5.732 1.23e-08 ***
## neighborhood_edwards -8.963e-02 1.338e-02 -6.698 3.11e-11 ***
## neighborhood_gilbert -2.202e-02 1.503e-02 -1.466 0.142963
## neighborhood_idotrr -5.114e-02 2.366e-02 -2.161 0.030861 *
## neighborhood_meadow_v -1.507e-01 2.989e-02 -5.041 5.28e-07 ***
## neighborhood_mitchel -6.355e-02 1.686e-02 -3.770 0.000170 ***
## neighborhood_n_ames -4.350e-02 1.164e-02 -3.736 0.000195 ***
## neighborhood_no_ridge 4.911e-02 1.973e-02 2.490 0.012913 *
## neighborhood_nridg_ht 6.064e-02 1.609e-02 3.768 0.000172 ***
## neighborhood_nw_ames -4.864e-02 1.524e-02 -3.192 0.001446 **
## neighborhood_old_town -5.997e-02 1.632e-02 -3.673 0.000249 ***
## neighborhood_sawyer -3.586e-02 1.499e-02 -2.392 0.016878 *
## neighborhood_stone_br 1.166e-01 2.251e-02 5.180 2.56e-07 ***
## condition1artery -9.408e-02 1.666e-02 -5.648 1.99e-08 ***
## condition1feedr -4.539e-02 1.246e-02 -3.641 0.000282 ***
## condition1pos_a -5.644e-02 3.771e-02 -1.496 0.134772
## condition1rr_ae -1.374e-01 3.161e-02 -4.345 1.50e-05 ***
## condition2pos_a 3.256e-01 1.066e-01 3.053 0.002311 **
## condition2pos_n -7.624e-01 7.411e-02 -10.288 < 2e-16 ***
## condition2rr_ae -4.546e-01 1.522e-01 -2.988 0.002864 **
## bldg_type2fm_con 5.721e-02 2.574e-02 2.222 0.026418 *
## bldg_type_twnhs -3.499e-02 2.020e-02 -1.732 0.083469 .
## house_style1_5fin -2.753e-02 1.378e-02 -1.997 0.046028 *
## house_style1_5unf 4.888e-02 2.922e-02 1.673 0.094655 .
## house_style2_5fin -1.038e-01 4.435e-02 -2.340 0.019449 *
## house_style2story -5.251e-02 1.540e-02 -3.409 0.000672 ***
## house_style_s_foyer 2.854e-02 1.967e-02 1.451 0.147089
## overall_qual 3.638e-02 4.021e-03 9.048 < 2e-16 ***
## overall_cond 3.993e-02 3.472e-03 11.498 < 2e-16 ***
## year_built 1.773e-03 2.662e-04 6.663 3.91e-11 ***
## year_remod_add 7.281e-04 2.150e-04 3.387 0.000727 ***
## roof_style_flat -3.546e-01 1.126e-01 -3.150 0.001668 **
## roof_style_gable -3.568e-01 1.105e-01 -3.228 0.001277 **
## roof_style_gambrel -3.561e-01 1.151e-01 -3.093 0.002023 **
## roof_style_hip -3.557e-01 1.108e-01 -3.212 0.001352 **
## roof_style_mansard -3.184e-01 1.170e-01 -2.721 0.006598 **
## roof_matl_cly_tile -2.185e+00 1.893e-01 -11.543 < 2e-16 ***
## roof_matl_membran 4.269e-01 1.168e-01 3.655 0.000268 ***
## roof_matl_metal 3.530e-01 1.127e-01 3.133 0.001769 **
## exterior1st_brk_comm -1.897e-01 7.826e-02 -2.424 0.015495 *
## exterior1st_brk_face 9.871e-02 2.080e-02 4.745 2.31e-06 ***
## exterior1st_wd_sdng -2.317e-02 8.947e-03 -2.589 0.009717 **
## exterior2nd_asb_shng -4.292e-02 2.426e-02 -1.769 0.077070 .
## exterior2nd_brk_face -6.603e-02 2.836e-02 -2.328 0.020037 *
## exterior2nd_hd_board -2.396e-02 9.108e-03 -2.630 0.008633 **
## exterior2nd_plywood -2.385e-02 1.093e-02 -2.183 0.029215 *
## mas_vnr_type_brk_cmn -4.568e-02 2.797e-02 -1.633 0.102662
## mas_vnr_area 3.606e-05 1.946e-05 1.853 0.064166 .
## exter_cond_fa -4.075e-02 2.302e-02 -1.770 0.076924 .
## exter_cond_gd -2.126e-02 9.648e-03 -2.204 0.027701 *
## foundation_brk_til 1.267e-01 6.062e-02 2.090 0.036785 *
## foundation_c_block 1.477e-01 5.966e-02 2.476 0.013409 *
## foundation_p_conc 1.623e-01 5.925e-02 2.739 0.006239 **
## foundation_slab 1.270e-01 6.523e-02 1.947 0.051733 .
## foundation_stone 2.217e-01 7.402e-02 2.994 0.002800 **
## bsmt_qual_ex 3.901e-02 1.336e-02 2.919 0.003571 **
## bsmt_cond_po 2.904e-01 1.187e-01 2.446 0.014582 *
## bsmt_exposure_av 1.922e-02 8.624e-03 2.229 0.025974 *
## bsmt_exposure_gd 5.220e-02 1.166e-02 4.476 8.24e-06 ***
## bsmt_fin_type1glq 1.744e-02 8.675e-03 2.011 0.044541 *
## bsmt_fin_sf1 1.412e-04 1.597e-05 8.842 < 2e-16 ***
## bsmt_fin_type2blq -3.959e-02 1.889e-02 -2.096 0.036245 *
## bsmt_fin_sf2 1.250e-04 2.243e-05 5.574 3.00e-08 ***
## bsmt_unf_sf 8.153e-05 1.433e-05 5.688 1.57e-08 ***
## heating_floor -1.470e-01 1.062e-01 -1.384 0.166597
## heating_gas_a -4.891e-02 2.479e-02 -1.973 0.048689 *
## heating_grav -2.057e-01 4.845e-02 -4.244 2.34e-05 ***
## heating_qc_ex 2.554e-02 7.192e-03 3.552 0.000396 ***
## central_air_n -4.817e-02 1.474e-02 -3.267 0.001113 **
## electrical_mix -2.741e-01 1.709e-01 -1.603 0.109070
## x2nd_flr_sf 6.189e-05 1.964e-05 3.150 0.001667 **
## bsmt_full_bath 2.530e-02 7.627e-03 3.317 0.000933 ***
## full_bath 1.444e-02 8.842e-03 1.633 0.102765
## half_bath 1.566e-02 8.588e-03 1.823 0.068490 .
## kitchen_abv_gr -6.371e-02 1.633e-02 -3.902 0.000100 ***
## kitchen_qual_ex 7.942e-02 1.339e-02 5.930 3.86e-09 ***
## functional_maj1 -6.194e-02 2.986e-02 -2.075 0.038216 *
## functional_maj2 -3.103e-01 5.107e-02 -6.076 1.60e-09 ***
## functional_min1 -4.034e-02 1.949e-02 -2.069 0.038705 *
## functional_min2 -5.192e-02 1.883e-02 -2.758 0.005895 **
## functional_mod -1.444e-01 2.911e-02 -4.959 7.99e-07 ***
## functional_sev -3.109e-01 1.115e-01 -2.789 0.005359 **
## fireplaces 1.945e-02 5.429e-03 3.583 0.000351 ***
## garage_type2types -9.648e-02 4.396e-02 -2.195 0.028344 *
## garage_yr_blt -3.203e-04 2.231e-04 -1.436 0.151321
## garage_cars 2.131e-02 8.954e-03 2.380 0.017473 *
## garage_area 1.260e-04 3.119e-05 4.041 5.63e-05 ***
## garage_qual_ex 4.518e-01 1.135e-01 3.981 7.23e-05 ***
## garage_qual_fa -3.562e-02 1.938e-02 -1.838 0.066330 .
## garage_cond_ex -3.720e-01 1.342e-01 -2.772 0.005653 **
## garage_cond_fa -3.758e-02 2.122e-02 -1.771 0.076769 .
## garage_cond_po 6.509e-02 4.779e-02 1.362 0.173439
## wood_deck_sf 9.315e-05 2.399e-05 3.884 0.000108 ***
## enclosed_porch 8.231e-05 5.040e-05 1.633 0.102699
## screen_porch 2.401e-04 5.211e-05 4.609 4.44e-06 ***
## pool_area 1.446e-03 6.946e-04 2.082 0.037559 *
## pool_qc_ex -6.779e-01 3.796e-01 -1.786 0.074312 .
## pool_qc_fa -8.767e-01 4.133e-01 -2.121 0.034081 *
## pool_qc_gd -6.705e-01 4.577e-01 -1.465 0.143214
## fence_gd_wo -3.508e-02 1.460e-02 -2.402 0.016436 *
## sale_type_con 1.146e-01 7.176e-02 1.596 0.110650
## sale_type_con_ld 1.336e-01 3.621e-02 3.689 0.000234 ***
## sale_type_cwd 8.893e-02 5.156e-02 1.725 0.084786 .
## sale_type_new 1.025e-01 1.486e-02 6.900 8.01e-12 ***
## sale_condition_adj_land 7.949e-02 5.470e-02 1.453 0.146406
## sale_condition_normal 5.248e-02 9.916e-03 5.292 1.41e-07 ***
## log_gr_liv_area 4.012e-01 2.362e-02 16.984 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.009762914)
##
## Null deviance: 232.801 on 1459 degrees of freedom
## Residual deviance: 13.024 on 1334 degrees of freedom
## AIC: -2493.1
##
## Number of Fisher Scoring iterations: 2
defaultSummary(data.frame(pred = predict(fit3), obs = train$log_sale_price))
## RMSE Rsquared MAE
## 0.09444766 0.94405631 0.06608136
PRESS(fit3$finalModel) # Press not working with caret models
## [1] Inf
varImp(fit3$finalModel)%>%
filter(Overall > 4) %>%
arrange(desc(Overall))
## Overall
## log_gr_liv_area 16.984408
## roof_matl_cly_tile 11.542834
## overall_cond 11.497800
## condition2pos_n 10.287568
## ms_zoning_c_all 9.065580
## overall_qual 9.047682
## bsmt_fin_sf1 8.842386
## lot_area 7.027994
## sale_type_new 6.900116
## neighborhood_edwards 6.697630
## year_built 6.663097
## functional_maj2 6.076291
## kitchen_qual_ex 5.929819
## neighborhood_crawfor 5.731806
## bsmt_unf_sf 5.688449
## condition1artery 5.647557
## bsmt_fin_sf2 5.574467
## land_slope_mod 5.512792
## sale_condition_normal 5.292479
## neighborhood_stone_br 5.180033
## neighborhood_meadow_v 5.040680
## functional_mod 4.959119
## land_slope_gtl 4.841711
## exterior1st_brk_face 4.744673
## screen_porch 4.608665
## bsmt_exposure_gd 4.476437
## condition1rr_ae 4.345298
## heating_grav 4.244451
## garage_area 4.040683
## ms_zoning_fv 4.027130
# Output the predictions for the test set to a csv file
# fit3x <- glm(formula = formula(fit3), data = train)
backward_pred <- predict(fit3$finalModel, newdata = test)
backward_pred %>%
data.frame() %>%
rownames_to_column(var = "id") %>%
mutate(SalePrice = exp(backward_pred)) %>%
dplyr::select(id, SalePrice) %>%
write_csv("Predictions/backward_predictions.csv")
Backward: AIC: -2493, RMSE: .0944, R2: 0.944
# Stepwise Selection
# Check if the model object exists, train if it doesn't
if (file.exists("Models/lm_stepwise.rds")) {
# Load the model object from disk
fit4 <- readRDS("Models/lm_stepwise.rds")
} else {
# Perform stepwise selection
# Set up a parallel backend with the number of cores you want to use
cores <- 8 # Change this to the number of cores you want to use
cl <- makePSOCKcluster(cores)
registerDoParallel(cl)
set.seed(137)
fit4 <- train(log_sale_price ~ . - sale_price,
data = train,
method = "glmStepAIC",
trControl = trainControl(method = "cv", number = 5, allowParallel = TRUE),
direction = "both",
penter = 0.05 # Not Working.
)
# Stop the parallel backend
stopCluster(cl)
# Save the model object to disk
saveRDS(fit4, "Models/lm_stepwise.rds")
}
summary(fit4$finalModel)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.67385 -0.04499 0.00226 0.04994 0.56163
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.870e+00 6.587e-01 5.875 5.35e-09 ***
## ms_sub_class -4.289e-04 1.145e-04 -3.746 0.000188 ***
## ms_zoning_c_all -3.515e-01 3.879e-02 -9.062 < 2e-16 ***
## ms_zoning_fv 7.642e-02 1.914e-02 3.992 6.92e-05 ***
## ms_zoning_rl 2.943e-02 1.230e-02 2.394 0.016823 *
## lot_frontage 3.848e-04 1.754e-04 2.194 0.028443 *
## lot_area 2.899e-06 4.260e-07 6.807 1.51e-11 ***
## street_grvl -9.626e-02 4.576e-02 -2.104 0.035599 *
## lot_shape_ir2 2.804e-02 1.725e-02 1.625 0.104295
## land_contour_bnk -2.574e-02 1.452e-02 -1.772 0.076560 .
## land_contour_low -4.131e-02 2.234e-02 -1.849 0.064644 .
## utilities_all_pub 1.911e-01 1.027e-01 1.861 0.063023 .
## lot_config_corner 1.286e-02 7.315e-03 1.758 0.078980 .
## lot_config_cul_d_sac 3.778e-02 1.203e-02 3.141 0.001722 **
## lot_config_fr2 -2.215e-02 1.554e-02 -1.425 0.154248
## lot_config_fr3 -6.926e-02 5.109e-02 -1.355 0.175506
## land_slope_gtl 2.144e-01 4.640e-02 4.620 4.20e-06 ***
## land_slope_mod 2.476e-01 4.661e-02 5.313 1.26e-07 ***
## neighborhood_br_dale -6.773e-02 3.307e-02 -2.048 0.040755 *
## neighborhood_collg_cr -2.313e-02 1.202e-02 -1.924 0.054566 .
## neighborhood_crawfor 9.824e-02 1.720e-02 5.711 1.39e-08 ***
## neighborhood_edwards -9.012e-02 1.337e-02 -6.740 2.36e-11 ***
## neighborhood_gilbert -2.255e-02 1.502e-02 -1.502 0.133422
## neighborhood_idotrr -5.336e-02 2.369e-02 -2.253 0.024418 *
## neighborhood_meadow_v -1.517e-01 2.989e-02 -5.078 4.36e-07 ***
## neighborhood_mitchel -6.330e-02 1.684e-02 -3.758 0.000178 ***
## neighborhood_n_ames -4.368e-02 1.163e-02 -3.755 0.000181 ***
## neighborhood_no_ridge 4.883e-02 1.971e-02 2.477 0.013368 *
## neighborhood_nridg_ht 6.432e-02 1.619e-02 3.974 7.46e-05 ***
## neighborhood_nw_ames -4.963e-02 1.523e-02 -3.258 0.001152 **
## neighborhood_old_town -6.204e-02 1.636e-02 -3.792 0.000156 ***
## neighborhood_sawyer -3.616e-02 1.498e-02 -2.414 0.015899 *
## neighborhood_stone_br 1.183e-01 2.251e-02 5.255 1.72e-07 ***
## condition1artery -9.396e-02 1.664e-02 -5.646 2.01e-08 ***
## condition1feedr -4.637e-02 1.247e-02 -3.720 0.000207 ***
## condition1rr_ae -1.374e-01 3.158e-02 -4.350 1.46e-05 ***
## condition2pos_a 3.291e-01 1.066e-01 3.088 0.002057 **
## condition2pos_n -7.581e-01 7.408e-02 -10.233 < 2e-16 ***
## condition2rr_ae -4.540e-01 1.520e-01 -2.987 0.002871 **
## bldg_type2fm_con 5.696e-02 2.572e-02 2.214 0.026973 *
## bldg_type_twnhs -3.701e-02 2.021e-02 -1.831 0.067303 .
## house_style1_5fin -2.637e-02 1.379e-02 -1.913 0.056014 .
## house_style1_5unf 4.834e-02 2.920e-02 1.655 0.098144 .
## house_style2_5fin -1.017e-01 4.432e-02 -2.296 0.021855 *
## house_style2story -5.083e-02 1.541e-02 -3.298 0.000998 ***
## house_style_s_foyer 2.841e-02 1.965e-02 1.445 0.148553
## overall_qual 3.593e-02 4.027e-03 8.923 < 2e-16 ***
## overall_cond 4.004e-02 3.470e-03 11.541 < 2e-16 ***
## year_built 1.783e-03 2.662e-04 6.699 3.09e-11 ***
## year_remod_add 7.367e-04 2.148e-04 3.430 0.000623 ***
## roof_style_flat -3.563e-01 1.125e-01 -3.169 0.001566 **
## roof_style_gable -3.587e-01 1.104e-01 -3.248 0.001190 **
## roof_style_gambrel -3.589e-01 1.150e-01 -3.120 0.001847 **
## roof_style_hip -3.564e-01 1.107e-01 -3.221 0.001311 **
## roof_style_mansard -3.188e-01 1.169e-01 -2.727 0.006470 **
## roof_matl_cly_tile -2.170e+00 1.893e-01 -11.464 < 2e-16 ***
## roof_matl_membran 4.247e-01 1.167e-01 3.638 0.000285 ***
## roof_matl_metal 3.488e-01 1.126e-01 3.098 0.001989 **
## exterior1st_brk_comm -1.897e-01 7.819e-02 -2.426 0.015412 *
## exterior1st_brk_face 9.884e-02 2.078e-02 4.755 2.20e-06 ***
## exterior1st_wd_sdng -2.245e-02 8.945e-03 -2.509 0.012214 *
## exterior2nd_asb_shng -4.330e-02 2.424e-02 -1.787 0.074222 .
## exterior2nd_brk_face -6.849e-02 2.836e-02 -2.415 0.015875 *
## exterior2nd_hd_board -2.379e-02 9.100e-03 -2.614 0.009050 **
## exterior2nd_plywood -2.363e-02 1.092e-02 -2.165 0.030543 *
## mas_vnr_type_brk_cmn -4.461e-02 2.795e-02 -1.596 0.110721
## mas_vnr_area 3.470e-05 1.946e-05 1.783 0.074840 .
## exter_cond_fa -4.078e-02 2.300e-02 -1.773 0.076491 .
## exter_cond_gd -2.089e-02 9.644e-03 -2.166 0.030473 *
## foundation_brk_til 1.279e-01 6.056e-02 2.112 0.034893 *
## foundation_c_block 1.487e-01 5.961e-02 2.494 0.012758 *
## foundation_p_conc 1.632e-01 5.920e-02 2.757 0.005908 **
## foundation_slab 1.256e-01 6.517e-02 1.927 0.054188 .
## foundation_stone 2.226e-01 7.395e-02 3.010 0.002660 **
## bsmt_qual_ex 3.767e-02 1.337e-02 2.817 0.004914 **
## bsmt_cond_po 2.893e-01 1.187e-01 2.438 0.014890 *
## bsmt_exposure_av 1.999e-02 8.628e-03 2.317 0.020656 *
## bsmt_exposure_gd 5.066e-02 1.168e-02 4.338 1.55e-05 ***
## bsmt_fin_type1glq 1.752e-02 8.668e-03 2.021 0.043428 *
## bsmt_fin_sf1 1.396e-04 1.598e-05 8.737 < 2e-16 ***
## bsmt_fin_type2blq -3.971e-02 1.887e-02 -2.104 0.035542 *
## bsmt_fin_sf2 1.236e-04 2.242e-05 5.515 4.19e-08 ***
## bsmt_unf_sf 7.977e-05 1.436e-05 5.556 3.32e-08 ***
## heating_floor -1.526e-01 1.061e-01 -1.438 0.150690
## heating_gas_a -5.058e-02 2.479e-02 -2.040 0.041498 *
## heating_grav -2.070e-01 4.842e-02 -4.275 2.04e-05 ***
## heating_qc_ex 2.629e-02 7.194e-03 3.654 0.000268 ***
## central_air_n -4.895e-02 1.474e-02 -3.322 0.000918 ***
## electrical_mix -2.711e-01 1.708e-01 -1.587 0.112792
## x2nd_flr_sf 5.997e-05 1.965e-05 3.052 0.002315 **
## bsmt_full_bath 2.547e-02 7.622e-03 3.342 0.000856 ***
## full_bath 1.567e-02 8.856e-03 1.769 0.077108 .
## half_bath 1.527e-02 8.590e-03 1.777 0.075740 .
## kitchen_abv_gr -6.334e-02 1.632e-02 -3.882 0.000109 ***
## kitchen_qual_ex 7.973e-02 1.338e-02 5.958 3.26e-09 ***
## functional_maj1 -6.256e-02 2.983e-02 -2.097 0.036152 *
## functional_maj2 -3.114e-01 5.102e-02 -6.103 1.36e-09 ***
## functional_min1 -4.067e-02 1.947e-02 -2.089 0.036930 *
## functional_min2 -5.207e-02 1.881e-02 -2.769 0.005708 **
## functional_mod -1.442e-01 2.909e-02 -4.956 8.10e-07 ***
## functional_sev -3.118e-01 1.114e-01 -2.800 0.005180 **
## fireplaces 1.968e-02 5.430e-03 3.624 0.000301 ***
## garage_type2types -9.617e-02 4.392e-02 -2.190 0.028724 *
## garage_yr_blt -3.249e-04 2.232e-04 -1.456 0.145706
## garage_cars 2.158e-02 8.946e-03 2.412 0.015995 *
## garage_area 1.255e-04 3.116e-05 4.027 5.98e-05 ***
## garage_qual_ex 3.728e-01 1.229e-01 3.032 0.002474 **
## garage_qual_fa -3.683e-02 1.941e-02 -1.897 0.058026 .
## garage_cond_ex -2.936e-01 1.421e-01 -2.066 0.039012 *
## garage_cond_fa -3.443e-02 2.133e-02 -1.615 0.106608
## garage_cond_po 6.682e-02 4.776e-02 1.399 0.162004
## wood_deck_sf 8.875e-05 2.412e-05 3.680 0.000242 ***
## enclosed_porch 8.528e-05 5.039e-05 1.692 0.090813 .
## screen_porch 2.428e-04 5.208e-05 4.662 3.44e-06 ***
## pool_area 1.455e-03 6.940e-04 2.096 0.036241 *
## pool_qc_ex -6.816e-01 3.792e-01 -1.797 0.072492 .
## pool_qc_fa -8.811e-01 4.129e-01 -2.134 0.033033 *
## pool_qc_gd -6.754e-01 4.573e-01 -1.477 0.139923
## fence_gd_wo -3.495e-02 1.459e-02 -2.396 0.016726 *
## sale_type_con 1.165e-01 7.171e-02 1.625 0.104492
## sale_type_con_ld 1.345e-01 3.618e-02 3.719 0.000208 ***
## sale_type_cwd 8.900e-02 5.151e-02 1.728 0.084270 .
## sale_type_new 1.020e-01 1.486e-02 6.863 1.03e-11 ***
## sale_condition_adj_land 8.051e-02 5.466e-02 1.473 0.140988
## sale_condition_normal 5.254e-02 9.907e-03 5.303 1.33e-07 ***
## log_gr_liv_area 4.012e-01 2.360e-02 16.999 < 2e-16 ***
## roof_matl_wd_shngl 7.825e-02 4.723e-02 1.657 0.097822 .
## condition1pos_a -5.642e-02 3.768e-02 -1.497 0.134543
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.009744362)
##
## Null deviance: 232.801 on 1459 degrees of freedom
## Residual deviance: 12.979 on 1332 degrees of freedom
## AIC: -2494
##
## Number of Fisher Scoring iterations: 2
defaultSummary(data.frame(pred = predict(fit4), obs = train$log_sale_price))
## RMSE Rsquared MAE
## 0.09428712 0.94424633 0.06603357
PRESS(fit4$finalModel) # Press not working with caret models
## [1] Inf
varImp(fit4$finalModel) %>%
filter(Overall > 4) %>%
arrange(desc(Overall))
## Overall
## log_gr_liv_area 16.998839
## overall_cond 11.540682
## roof_matl_cly_tile 11.463697
## condition2pos_n 10.233332
## ms_zoning_c_all 9.061792
## overall_qual 8.923294
## bsmt_fin_sf1 8.736801
## sale_type_new 6.862606
## lot_area 6.806617
## neighborhood_edwards 6.739504
## year_built 6.698790
## functional_maj2 6.103470
## kitchen_qual_ex 5.958244
## neighborhood_crawfor 5.710567
## condition1artery 5.645616
## bsmt_unf_sf 5.556263
## bsmt_fin_sf2 5.514768
## land_slope_mod 5.312905
## sale_condition_normal 5.303034
## neighborhood_stone_br 5.254721
## neighborhood_meadow_v 5.077700
## functional_mod 4.956417
## exterior1st_brk_face 4.755398
## screen_porch 4.662298
## land_slope_gtl 4.620443
## condition1rr_ae 4.350094
## bsmt_exposure_gd 4.337773
## heating_grav 4.275294
## garage_area 4.026542
# Output the predictions for the test set to a csv file
# fit4x <- glm(formula = formula(fit4), data = train)
stepwise_pred <- predict(fit4$finalModel, newdata = test)
stepwise_pred %>%
data.frame() %>%
rownames_to_column(var = "id") %>%
mutate(SalePrice = exp(stepwise_pred)) %>%
dplyr::select(id, SalePrice) %>%
write_csv("Predictions/stepwise_predictions.csv")
AIC: -2493, RMSE: .0944, R2: 0.944 (same as backward)
For custom features, we wil use the top ten parameters ranked by importance from the stepwise model. We expect that this model will not perform as well, but each parameter will be more explainable, and fitting should require much less compute. The question we hope to answer is whether the increase in performance is worth the cost for one of the brute-force models, and whether ten parameters is too few to capture the complexity of the data.
# Custom Feature Selection
top10 <- varImp(fit4$finalModel) %>%
filter(Overall > 4) %>%
arrange(desc(Overall)) %>%
head(10) %>%
rownames()
form <- as.formula(paste("log_sale_price ~", paste(top10, collapse = "+")))
fit5 <- lm(form, data = train)
summary(fit5)
##
## Call:
## lm(formula = form, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.88743 -0.08720 0.01853 0.10421 0.62038
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.693e+00 1.100e-01 69.902 < 2e-16 ***
## log_gr_liv_area 4.401e-01 1.647e-02 26.712 < 2e-16 ***
## overall_cond 2.339e-02 3.851e-03 6.074 1.59e-09 ***
## roof_matl_cly_tile -2.526e+00 1.708e-01 -14.793 < 2e-16 ***
## condition2pos_n -9.189e-01 1.159e-01 -7.930 4.35e-15 ***
## ms_zoning_c_all -4.007e-01 5.160e-02 -7.765 1.53e-14 ***
## overall_qual 1.429e-01 4.195e-03 34.059 < 2e-16 ***
## bsmt_fin_sf1 1.989e-04 1.013e-05 19.635 < 2e-16 ***
## sale_type_new 1.788e-01 1.637e-02 10.924 < 2e-16 ***
## lot_area 3.665e-06 4.442e-07 8.250 3.53e-16 ***
## neighborhood_edwards -6.591e-02 1.718e-02 -3.837 0.00013 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1603 on 1449 degrees of freedom
## Multiple R-squared: 0.8401, Adjusted R-squared: 0.839
## F-statistic: 761.5 on 10 and 1449 DF, p-value: < 2.2e-16
defaultSummary(data.frame(pred = predict(fit5), obs = train$log_sale_price))
## RMSE Rsquared MAE
## 0.1596549 0.8401425 0.1198526
PRESS(fit5) # Press not working with caret models
## [1] Inf
# varImp(fit5$finalModel) %>%
# filter(Overall > 4) %>%
# arrange(desc(Overall))
# Output the predictions for the test set to a csv file
# fit4x <- glm(formula = formula(fit4), data = train)
custom_pred <- predict(fit5, newdata = test)
custom_pred %>%
data.frame() %>%
rownames_to_column(var = "id") %>%
mutate(SalePrice = exp(custom_pred)) %>%
dplyr::select(id, SalePrice) %>%
write_csv("Predictions/custom_predictions.csv")
AIC: -1190, RMSE: .16, R2: 0.84 (worse but more parsimonious)
Adj R2, Internal CV press, Kaggle score
Checking the assumptions of the forward model. The other models should be similar.
# Plot the studentized residuals using base R
plot(fit2$finalModel$fitted.values, fit2$finalModel$residuals, type = "p")
plot(fit2$finalModel$residuals)
hist(fit2$finalModel$residuals, freq = FALSE)
curve(dnorm(x, mean = mean(fit2$finalModel$residuals), sd = sd(fit2$finalModel$residuals)), add = TRUE, col = "blue")
There is no evidence of non-linearity, heteroscedasticity, or non-normality.
# Plot the residuals vs. the fitted values
fit2$finalModel %>%
plot()
There are several points with a cook’s d approximately 1 which could be
investigated. However, the results are consistent with the assumptions
of the linear model.
Conclusion text
knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)
# Required Libraries
library(tidyverse)
library(knitr)
library(kableExtra)
library(ggthemes)
library(caret)
library(janitor)
library(doParallel)
#library(e1071)
#library(class)
train <- read.csv("Data/train.csv")
test <- read.csv("Data/test.csv")
# Merge the data frames and add a column indicating whether they come from the train or test set
train$train <- 1
test$SalePrice <- NA
test$train <- 0
ames <- rbind(train, test)
# Verify data frame
head(ames)
str(ames)
summary(ames)
# Summarize NA's by column
ames %>%
summarise_all(funs(sum(is.na(.)))) %>%
gather(key = "Column", value = "NA_Count", -1) %>%
filter(NA_Count > 0) %>%
ggplot(aes(x = reorder(Column, NA_Count), y = NA_Count)) +
geom_col() +
coord_flip() +
theme_gdocs() +
labs(title = "Number of NA's by Column", x = "Column", y = "NA Count")
# Create a table of the missing NAs by column
ames %>%
summarise_all(funs(sum(is.na(.)))) %>%
gather(key = "Column", value = "NA_Count", -1) %>%
filter(NA_Count > 0) %>%
arrange(desc(NA_Count)) %>%
kable()
# Plot Sale Price vs. Gross Living Area colored by neighborhood, omitting rows where SalePrice is NA
ames %>%
filter(!is.na(SalePrice)) %>%
ggplot(aes(x = GrLivArea, y = SalePrice, color = Neighborhood)) +
geom_point() +
theme_gdocs() +
labs(title = "Sale Price vs. Gross Living Area by Neighborhood", x = "Gross Living Area", y = "Sale Price")
# Plot log(Sale Price) vs. log(Gross Living Area) colored by neighborhood, omitting rows where SalePrice is NA
ames %>%
filter(!is.na(SalePrice)) %>%
ggplot(aes(x = log(GrLivArea), y = log(SalePrice), color = Neighborhood)) +
geom_point() +
theme_gdocs() +
labs(
title = "log(Sale Price) vs. log(Gross Living Area) by Neighborhood",
x = "log(Gross Living Area)",
y = "log(Sale Price)"
)
# Create columns for log(SalePrice) and log(GrLivArea)
ames$logSalePrice <- log(ames$SalePrice)
ames$logGrLivArea <- log(ames$GrLivArea)
PRESS <- function(linear.model) {
#' calculate the predictive residuals
pr <- residuals(linear.model) / (1 - lm.influence(linear.model)$hat)
#' calculate the PRESS
PRESS <- sum(pr^2)
return(PRESS)
}
# Function for calculating PRESS
# Tom Hopper
# https://gist.github.com/tomhopper/8c204d978c4a0cbcb8c0
# Plot log(Sale Price) vs. log(Gross Living Area) colored by neighborhood, omitting rows where SalePrice is NA for only the neighborhoods of interest
century21 <-
ames %>%
filter(!is.na(SalePrice)) %>%
filter(Neighborhood %in% c("NAmes", "Edwards", "BrkSide"))
century21 %>%
ggplot(aes(x = logGrLivArea, y = logSalePrice, color = Neighborhood)) +
geom_point() +
theme_gdocs() +
labs(
title = "log(Sale Price) vs. log(Gross Living Area) by Neighborhood",
x = "log(Gross Living Area)",
y = "log(Sale Price)"
)
# Fit a linear model to the data
fit1x <- lm(logSalePrice ~ logGrLivArea + Neighborhood, data = century21)
summary(fit1x)
PRESS(fit1x)
# Fit a linear model to the data with interaction variables
fit1 <- lm(logSalePrice ~ logGrLivArea * Neighborhood, data = century21)
summary(fit1)
PRESS(fit1)
confint(fit1) %>% kable()
# Plot the data with the linear model superposed
century21 %>%
ggplot(aes(x = logGrLivArea, y = logSalePrice, color = Neighborhood)) +
geom_point() +
theme_gdocs() +
labs(
title = "log(Sale Price) vs. log(Gross Living Area) by Neighborhood",
x = "log(Gross Living Area)",
y = "log(Sale Price)"
) +
geom_smooth(
method = "lm", formula = y ~ x, se = FALSE, linewidth = 1,
data = data.frame(
logGrLivArea = century21$logGrLivArea,
Neighborhood = century21$Neighborhood,
logSalePrice = predict(fit1)
)
)
# # Print parameter estimate table nicely. Not working, needs debugging
# fit1 %>%
# summary() %>%
# {cbind(as.data.frame(coef(.)), .[["coefficients"]][, 2:4])} %>%
# setNames(c("Estimate", "Std. Error", "t-value", "Pr(>|t|)")) %>%
# rownames_to_column(var = "Term") %>%
# mutate(Term = ifelse(Term == "(Intercept)", "Intercept", Term)) %>%
# add_row(Term = "Adjusted R-squared", Estimate = round(.$adj.r.squared, 3), Std..Error = NA, `t-value` = NA, `Pr(>|t|)` = NA) %>%
# kable(digits = 3, align = "c") %>%
# kable_styling(full_width = FALSE)
# Plot the studentized residuals using base R
plot(fit1$fitted.values, fit1$residuals, type = "p")
plot(fit1$residuals)
# Decide which of these we like better ggplot or R
# Calculate studentized residuals
stud_res <- rstudent(fit1)
# Create a data frame with the studentized residuals
df <- data.frame(stud_res, logGrLivArea = model.frame(fit1)$logGrLivArea)
# Create a scatterplot of the studentized residuals
ggplot(df, aes(x = logGrLivArea, y = stud_res)) +
geom_point() +
labs(title = "Scatterplot of Studentized Residuals",
x = "Studentized Residuals",
y = "Frequency") +
theme_minimal()
# Create histogram with normal curve
ggplot(df, aes(x = stud_res)) +
geom_histogram(aes(y = ..density..), binwidth = 0.5, fill = "lightblue", color = "black") +
stat_function(fun = dnorm, args = list(mean = mean(df$stud_res), sd = sd(df$stud_res)), color = "blue", size = 1.2) +
labs(title = "Histogram of Studentized Residuals with Normal Curve",
x = "Studentized Residuals",
y = "Frequency") +
theme_minimal()
# Plot the residuals vs. the fitted values
fit1 %>%
plot()
# Data Cleaning
# If pool-related variables are NA, assume there is no pool and assign to 0
ames <- ames %>%
mutate(
PoolQC = ifelse(is.na(PoolQC), "None", PoolQC),
PoolArea = ifelse(is.na(PoolArea), 0, PoolArea),
)
# If garage-related variables are NA, assume there is no garage and assign to 0
ames <- ames %>%
mutate(
GarageType = ifelse(is.na(GarageType), "None", GarageType),
#GarageYrBlt = ifelse(is.na(GarageYrBlt), 0, GarageYrBlt), #These will be changed to the mean because of large year values
GarageFinish = ifelse(is.na(GarageFinish), "None", GarageFinish),
GarageCars = ifelse(is.na(GarageCars), 0, GarageCars),
GarageArea = ifelse(is.na(GarageArea), 0, GarageArea),
GarageQual = ifelse(is.na(GarageQual), "None", GarageQual),
GarageCond = ifelse(is.na(GarageCond), "None", GarageCond)
)
# If Bsmt-related variables are NA, assume there is no Bsmt and assign to 0
ames <- ames %>%
mutate(
BsmtQual = ifelse(is.na(BsmtQual), "None", BsmtQual),
BsmtCond = ifelse(is.na(BsmtCond), "None", BsmtCond),
BsmtExposure = ifelse(is.na(BsmtExposure), "None", BsmtExposure),
BsmtFinType1 = ifelse(is.na(BsmtFinType1), "None", BsmtFinType1),
BsmtFinSF1 = ifelse(is.na(BsmtFinSF1), 0, BsmtFinSF1),
BsmtFinType2 = ifelse(is.na(BsmtFinType2), "None", BsmtFinType2),
BsmtFinSF2 = ifelse(is.na(BsmtFinSF2), 0, BsmtFinSF2),
BsmtUnfSF = ifelse(is.na(BsmtUnfSF), 0, BsmtUnfSF),
TotalBsmtSF = ifelse(is.na(TotalBsmtSF), 0, TotalBsmtSF)
)
# If Fence-related variables are NA, assume there is no Fence and assign to 0
ames <- ames %>%
mutate(
Fence = ifelse(is.na(Fence), "None", Fence),
)
# If Misc-related variables are NA, assume there is no Misc features and assign to 0
ames <- ames %>%
mutate(
MiscFeature = ifelse(is.na(MiscFeature), "None", MiscFeature),
)
# If Fireplace-related variables are NA, assume there is no Fireplace and assign to 0
ames <- ames %>%
mutate(
FireplaceQu = ifelse(is.na(FireplaceQu), "None", FireplaceQu),
)
# If Alley-related variables are NA, assume there is no Alley and assign to 0
ames <- ames %>%
mutate(
Alley = ifelse(is.na(Alley), "None", Alley),
)
# Summarize the amount of remaining NA's by column to check what's left
colSums(is.na(ames))
# Use the dummyVars() function to convert categorical variables into dummy variables
# Then use janitor::clean_names() to clean up the column names
dummy_model <- dummyVars(~ ., data = ames)
ames_dummy <- as.data.frame(predict(dummy_model, newdata = ames))
ames_dummy <- clean_names(ames_dummy)
# Fill in all remaining na values with the mean of the column
ames_dummy <- ames_dummy %>%
mutate(across(
c(-sale_price, -log_sale_price),
~ ifelse(is.na(.), mean(., na.rm = TRUE), .)
))
# Summarize the amount of remaining NA's by column
colSums(is.na(ames_dummy))
# Split the data into training and testing sets
train <- ames_dummy[ames_dummy$train == 1, ]
test <- ames_dummy[ames_dummy$train == 0, ]
# Forward Selection
# # Testing olsrr method.
# library(olsrr)
# fit2x <- lm(log_sale_price ~ . - sale_price, data = train)
# fit2y <- ols_step_forward_p(fit2x, penter = 0.15)$model
# summary(fit2y)
# defaultSummary(data.frame(pred = predict(fit2y), obs = train$log_sale_price))
# PRESS(fit2y)
# Check if the model object exists, train if it doesn't
if (file.exists("Models/lm_forwards.rds")) {
# Load the model object from disk
fit2 <- readRDS("Models/lm_forwards.rds")
} else {
# Perform stepwise selection
# Set up a parallel backend with the number of cores you want to use
cores <- 8 # Change this to the number of cores you want to use
cl <- makePSOCKcluster(cores)
registerDoParallel(cl)
set.seed(137)
ctrl <- trainControl(
method = "boot",
number = 5,
allowParallel = TRUE
)
fit2 <- train(log_sale_price ~ . - sale_price,
data = train,
method = "glmStepAIC",
trControl = ctrl,
direction = "forward",
penter = 0.05 # Not Working.
)
# Stop the parallel backend
stopCluster(cl)
# Save the model object to disk
saveRDS(fit2, "Models/lm_forwards.rds")
}
summary(fit2$finalModel)
defaultSummary(data.frame(pred = predict(fit2), obs = train$log_sale_price))
PRESS(fit2$finalModel) #Press not working with caret models
varImp(fit2$finalModel)%>%
filter(Overall > 4) %>%
arrange(desc(Overall))
# Output the predictions for the test set to a csv file
# fit2x <- glm(formula = formula(fit2), data = train)
forward_pred <- predict(fit2$finalModel, test)
forward_pred %>%
data.frame() %>%
rownames_to_column(var = "id") %>%
mutate(SalePrice = exp(forward_pred)) %>%
dplyr::select(id, SalePrice) %>%
write_csv("Predictions/forward_predictions.csv")
# Backwards Selection
# Check if the model object exists, train if it doesn't
if (file.exists("Models/lm_backwards.rds")) {
# Load the model object from disk
fit3 <- readRDS("Models/lm_backwards.rds")
} else {
# Perform stepwise selection
# Set up a parallel backend with the number of cores you want to use
cores <- 8 # Change this to the number of cores you want to use
cl <- makePSOCKcluster(cores)
registerDoParallel(cl)
set.seed(137)
fit3 <- train(log_sale_price ~ . - sale_price,
data = train,
method = "glmStepAIC",
trControl = trainControl(method = "cv", number = 5, allowParallel = TRUE),
direction = "backward",
penter = 0.05 # Not Working.
)
# Stop the parallel backend
stopCluster(cl)
# Save the model object to disk
saveRDS(fit3, "Models/lm_backwards.rds")
}
summary(fit3$finalModel)
defaultSummary(data.frame(pred = predict(fit3), obs = train$log_sale_price))
PRESS(fit3$finalModel) # Press not working with caret models
varImp(fit3$finalModel)%>%
filter(Overall > 4) %>%
arrange(desc(Overall))
# Output the predictions for the test set to a csv file
# fit3x <- glm(formula = formula(fit3), data = train)
backward_pred <- predict(fit3$finalModel, newdata = test)
backward_pred %>%
data.frame() %>%
rownames_to_column(var = "id") %>%
mutate(SalePrice = exp(backward_pred)) %>%
dplyr::select(id, SalePrice) %>%
write_csv("Predictions/backward_predictions.csv")
# Stepwise Selection
# Check if the model object exists, train if it doesn't
if (file.exists("Models/lm_stepwise.rds")) {
# Load the model object from disk
fit4 <- readRDS("Models/lm_stepwise.rds")
} else {
# Perform stepwise selection
# Set up a parallel backend with the number of cores you want to use
cores <- 8 # Change this to the number of cores you want to use
cl <- makePSOCKcluster(cores)
registerDoParallel(cl)
set.seed(137)
fit4 <- train(log_sale_price ~ . - sale_price,
data = train,
method = "glmStepAIC",
trControl = trainControl(method = "cv", number = 5, allowParallel = TRUE),
direction = "both",
penter = 0.05 # Not Working.
)
# Stop the parallel backend
stopCluster(cl)
# Save the model object to disk
saveRDS(fit4, "Models/lm_stepwise.rds")
}
summary(fit4$finalModel)
defaultSummary(data.frame(pred = predict(fit4), obs = train$log_sale_price))
PRESS(fit4$finalModel) # Press not working with caret models
varImp(fit4$finalModel) %>%
filter(Overall > 4) %>%
arrange(desc(Overall))
# Output the predictions for the test set to a csv file
# fit4x <- glm(formula = formula(fit4), data = train)
stepwise_pred <- predict(fit4$finalModel, newdata = test)
stepwise_pred %>%
data.frame() %>%
rownames_to_column(var = "id") %>%
mutate(SalePrice = exp(stepwise_pred)) %>%
dplyr::select(id, SalePrice) %>%
write_csv("Predictions/stepwise_predictions.csv")
# Custom Feature Selection
top10 <- varImp(fit4$finalModel) %>%
filter(Overall > 4) %>%
arrange(desc(Overall)) %>%
head(10) %>%
rownames()
form <- as.formula(paste("log_sale_price ~", paste(top10, collapse = "+")))
fit5 <- lm(form, data = train)
summary(fit5)
defaultSummary(data.frame(pred = predict(fit5), obs = train$log_sale_price))
PRESS(fit5) # Press not working with caret models
# varImp(fit5$finalModel) %>%
# filter(Overall > 4) %>%
# arrange(desc(Overall))
# Output the predictions for the test set to a csv file
# fit4x <- glm(formula = formula(fit4), data = train)
custom_pred <- predict(fit5, newdata = test)
custom_pred %>%
data.frame() %>%
rownames_to_column(var = "id") %>%
mutate(SalePrice = exp(custom_pred)) %>%
dplyr::select(id, SalePrice) %>%
write_csv("Predictions/custom_predictions.csv")
# Plot the studentized residuals using base R
plot(fit2$finalModel$fitted.values, fit2$finalModel$residuals, type = "p")
plot(fit2$finalModel$residuals)
hist(fit2$finalModel$residuals, freq = FALSE)
curve(dnorm(x, mean = mean(fit2$finalModel$residuals), sd = sd(fit2$finalModel$residuals)), add = TRUE, col = "blue")
# Plot the residuals vs. the fitted values
fit2$finalModel %>%
plot()